# Session set-up: load every R package used in this notebook and define the
# project folder. Package load order is kept as-is because later loads mask
# earlier ones (e.g. plyr/tidyr function name clashes).
# NOTE(review): warn = -1 silences ALL warnings for the whole session, which
# can hide real problems -- consider suppressWarnings() around the specific
# noisy calls instead.
options(warn = -1)
library(reticulate)   # run Python chunks and exchange objects via py$
library(kableExtra)
library(knitr)
library(phyloseq)     # core microbiome containers: otu_table, phyloseq, UniFrac
library(microbiome)   # microbiome::transform() for the CLR transform below
library(philr)        # phylogenetic ILR (PHILR) transform
library(ape)          # tree utilities: read_tree backend, makeNodeLabel, is.rooted
library(metacoder)
library("data.table")
library(vegan)
library(tidyr)
library("plyr")
library(gridExtra)
library(randomcoloR)
library(DESeq2)
# Project root; all relative paths below are pasted onto this
folder <- '/Users/robynwright/Documents/OneDrive/Langille_Lab_postdoc/Acid_rain_recovery/'
from Bio import SeqIO
from Bio.SeqRecord import SeqRecord
from Bio.Seq import Seq
import numpy as np
import os
import pandas as pd
import math
import matplotlib.pyplot as plt
from scipy.cluster import hierarchy
import matplotlib as mpl
from matplotlib.lines import Line2D
from matplotlib_venn import venn2
import csv
from matplotlib.patches import Patch
from matplotlib import pyplot
import pickle
from scipy.spatial import distance
from scipy import stats
from sklearn import manifold
from sklearn.decomposition import PCA
from scipy.cluster import hierarchy
import scipy.spatial.distance as ssd
folder = '/Users/robynwright/Documents/OneDrive/Langille_Lab_postdoc/Acid_rain_recovery/'
This is something like what Andre would have done to process the samples prior to giving the data to you, which is based on the standard workflow here. After most steps there is a summarise option, which I would look at as it will likely give a good indication of whether something has gone wrong or not. If you see a large jump in the number of samples or number of features (ASVs) from one step to the next then this would be good to look into.
This requires that QIIME2 is installed (directions for that here). It can be installed locally, but with larger datasets it is likely to run out of memory if you have less than ~32GB RAM. You should follow the instructions on the website as this will have the most recent version, but will look something like this (run in terminal):
# Download and install Anaconda, then create a dedicated conda environment
# containing QIIME2 2020.2 (run in a terminal).
wget https://repo.anaconda.com/archive/Anaconda3-2019.10-Linux-x86_64.sh
chmod +x Anaconda3-2019.10-Linux-x86_64.sh
# Fixed: no space after "./" -- the original "./ Anaconda3-..." would not
# execute the installer script.
./Anaconda3-2019.10-Linux-x86_64.sh
conda update conda
conda install wget
wget https://data.qiime2.org/distro/core/qiime2-2020.2-py36-linux-conda.yml
conda env create -n qiime2-2020.2 --file qiime2-2020.2-py36-linux-conda.yml
# Clean up the environment file once the env has been created
rm qiime2-2020.2-py36-linux-conda.yml
Activate the environment:
conda activate qiime2-2020.2
This will give you a .html file showing basic quality metrics for all of the sequences in each of your files.
mkdir fastqc
fastqc -t 4 path_to_folder/*.fastq.gz -o path_to_output_folder
multiqc path_to_output_folder
Read in your samples to QIIME2:
qiime tools import \
--type SampleData[PairedEndSequencesWithQuality] \
--input-path path_to_fastq_files/ \
--output-path reads.qza \
--input-format CasavaOneEightSingleLanePerSampleDirFmt
Summarise the samples:
qiime demux summarize \
--i-data reads.qza \
--o-visualization summary_reads.qzv
Remove the primers from the ends of reads (you will need to make sure that the sequences match the primers you have used - they will be on the IMR website):
qiime cutadapt trim-paired \
--i-demultiplexed-sequences reads.qza \
--p-cores 8 \
--p-front-f GTGYCAGCMGCCGCGGTAA \
--p-front-r CCGYCAATTYMTTTRAGTTT \
--p-discard-untrimmed \
--p-no-indels \
--o-trimmed-sequences trimmed_reads.qza
Summarise:
qiime demux summarize \
--i-data trimmed_reads.qza \
--o-visualization summary_trimmed_reads.qzv
Here you can choose whether to run DADA2 or Deblur. These are two different algorithms for denoising the reads - DADA2 must be run only with samples from the same MiSeq run as it looks at a subset of your samples to learn the error profile and correct the reads. Deblur uses a per sample approach and you therefore don’t need to know whether the samples were run on the same MiSeq run or not. The results from each will probably be very similar.
The --p-trunc-len options here need to be edited based on the quality profiles of your reads (you can see them in the summary_trimmed_reads.qzv above). Typically you choose to trim at the point in the reads where the median quality score drops below ~25-30. This always starts sooner on the reverse read. Also keep in mind that your reads must still overlap by enough that they can be joined together. The max-ee options correspond to the maximum number of errors that you accept in each of the reads.
mkdir dada2_out
qiime dada2 denoise-paired \
--i-demultiplexed-seqs trimmed_reads.qza \
--p-trunc-len-f 260 \
--p-trunc-len-r 160 \
--p-max-ee-f 2 \
--p-max-ee-r 2 \
--p-n-threads 8 \
--o-table dada2_out/table.qza \
--o-representative-sequences dada2_out/representative_sequences.qza \
--o-denoising-stats dada2_out/stats.qza
Summarise:
qiime metadata tabulate \
--m-input-file dada2_out/stats.qza \
--o-visualization stats_dada2_out.qzv
qiime feature-table summarize \
--i-table dada2_out/table.qza \
--o-visualization summary_dada2_out.qzv
Join paired end reads:
# Join the forward and reverse reads with vsearch (Deblur branch only).
# Fixed: the input artifact is trimmed_reads.qza -- the cutadapt output
# produced above; the original referenced a non-existent reads_trimmed.qza.
qiime vsearch join-pairs \
--i-demultiplexed-seqs trimmed_reads.qza \
--o-joined-sequences reads_joined.qza
Summarise:
qiime demux summarize \
--i-data reads_joined.qza \
--o-visualization summary_reads_joined.qzv
Filter low quality reads:
qiime quality-filter q-score-joined \
--i-demux reads_joined.qza \
--o-filter-stats filt_stats.qza \
--o-filtered-sequences reads_joined_filtered.qza
Summarise:
qiime demux summarize \
--i-data reads_joined_filtered.qza \
--o-visualization summary_reads_joined_filtered.qzv
Here you should decide on the length to keep after joining the reads, so looking at where the median quality score drops below ~25-30. Run deblur:
qiime deblur denoise-16S \
--i-demultiplexed-seqs reads_joined_filtered.qza \
--p-trim-length 402 \
--p-left-trim-len 0 \
--p-sample-stats \
--p-jobs-to-start 12 \
--p-min-reads 1 \
--output-dir deblur_output
Summarise:
# Summarise the Deblur feature table.
# Fixed: deblur wrote its outputs to --output-dir deblur_output (above), so
# the table lives at deblur_output/table.qza -- the original pointed at a
# non-existent deblur_output_quality/ directory.
qiime feature-table summarize \
--i-table deblur_output/table.qza \
--o-visualization deblur_table_summary.qzv
qiime feature-table merge \
--i-tables dada2_out1/table.qza \
--i-tables dada2_out2/table.qza \
--i-tables dada2_out3/table.qza \
--o-merged-table merged_table.qza
qiime feature-table merge-seqs \
--i-data dada2_out1/representative_sequences.qza \
--i-data dada2_out2/representative_sequences.qza \
--i-data dada2_out3/representative_sequences.qza \
--o-merged-data merged_representative_sequences.qza
Summarise:
qiime feature-table summarize \
--i-table merged_table.qza \
--o-visualization merged_table_summary.qzv
Classify each of your ASVs (inferred by either DADA2 or Deblur above) to find out what their taxonomy is. This is the step that is likely to use the most memory. If it is a problem you may be able to get around it by filtering out very low abundance - typically where e.g. only 1 read is found in all samples, or less than 10 reads overall.
Can update classifier/download for the correct region here
qiime feature-classifier classify-sklearn \
--i-reads merged_representative_sequences.qza \
--i-classifier /home/shared/taxa_classifiers/qiime2-2020.2_classifiers/classifier_silva_132_99_16S_V4.V5_515F_926R.qza \
--p-n-jobs 8 \
--output-dir taxa
Export:
qiime tools export \
--input-path taxa/classification.qza \
--output-path taxa
This step removes very low abundance taxa (assuming you didn’t do it already above) and also taxa that weren’t classified at the phylum/domain level or were classified as mitochondria or chloroplasts. I usually either use a minimum frequency of 10, or of 0.1% of the average frequency per sample (which is what 28 represents).
qiime feature-table filter-features \
--i-table merged_table.qza \
--p-min-frequency 28 \
--p-min-samples 1 \
--o-filtered-table merged_table_filtered.qza
qiime taxa filter-table \
--i-table merged_table_filtered.qza \
--i-taxonomy taxa/classification.qza \
--p-include d__ \
--p-exclude mitochondria,chloroplast \
--o-filtered-table merged_table_filtered_contamination.qza
Summarise:
qiime feature-table summarize \
--i-table merged_table_filtered_contamination.qza \
--o-visualization summary_merged_table_filtered_contamination.qzv
This gives a good idea as to whether your samples are sequenced to sufficient depth. --p-max-depth should be edited to be the maximum number of features found in your samples (from the most recent summary).
qiime diversity alpha-rarefaction \
--i-table merged_table_filtered_contamination.qza \
--p-max-depth 99645 \
--p-steps 20 \
--p-metrics 'observed_otus' \
--o-visualization merged_rarefaction_curves.qzv
Typically, you will now remove samples that have below a certain number of sequences. You should be able to decide this based on where the rarefaction curves start to plateau. If you don’t see a plateau then this suggests that your samples weren’t sequenced to a sufficient depth to capture all of the diversity present. Rarefaction is optional and some people think it should always be done whereas others think it should never be done. Rarefaction is basically a way of dealing with the fact that your samples are not sequenced equally, and there is no way to know whether this is because there were really fewer organisms in each sample, or this is due to some other analysis step (DNA extraction was less efficient, for example). So generally one of several methods for normalisation will be used: - conversion to relative abundance - rarefaction (resampling of each sample to the same number of sequences - the curves above show how many taxa you have by resampling each sample to a range of different depths) - conversion to log ratios There are several studies suggesting that conversion to log ratios should always be used -because it is best at treating the abundances of organisms in samples as compositions (reflecting that you don’t have an absolute measurement of the number of organisms in your sample)- but rarefaction is still the most effective method if the number of sequences per sample is very variable (I would personally define this as more than a magnitude of difference between the smallest and largest depths). So, if you choose to rarefy, you should rarefy to the same number of reads as the smallest sample remaining after removing those with very low numbers of reads.
qiime feature-table filter-samples \
--i-table merged_table_filtered_contamination.qza \
--p-min-frequency 5000 \
--o-filtered-table merged_table_final.qza
###
#optional#
qiime feature-table rarefy \
--i-table merged_table_final.qza \
--p-sampling-depth 5000 \
--o-rarefied-table merged_table_final_rarefied.qza
###
qiime feature-table filter-seqs \
--i-data merged_representative_sequences.qza \
--i-table merged_table_final.qza \
--o-filtered-data representative_sequences_final.qza
This gives you a phylogenetic tree of your sequences based on all of the sequences in the Silva v128 database, which is typically more accurate than constructing a de novo tree.
# Insert the final representative sequences into the SILVA reference tree
# with SEPP.
# Fixed: the tree is written as insertion_tree.qza so that the later export
# step and core-metrics-phylogenetic (which both read insertion_tree.qza)
# find it; the original wrote insertion_tree_rarefied.qza.
qiime fragment-insertion sepp \
--i-representative-sequences representative_sequences_final.qza \
--i-reference-database ref_alignments/sepp-refs-silva-128.qza \
--o-tree insertion_tree.qza \
--o-placements insertion_placements.qza \
--p-threads 8
qiime tools export \
--input-path representative_sequences_final.qza \
--output-path exports
sed -i -e '1 s/Feature/#Feature/' -e '1 s/Taxon/taxonomy/' taxa/taxonomy.tsv
qiime tools export \
--input-path merged_table_final.qza \
--output-path exports
biom add-metadata \
-i exports/feature-table.biom \
-o exports/feature-table_w_tax.biom \
--observation-metadata-fp taxa/taxonomy.tsv \
--sc-separated taxonomy
biom convert \
-i exports/feature-table_w_tax.biom \
-o exports/feature-table_w_tax.txt \
--to-tsv \
--header-key taxonomy
qiime tools export \
--input-path insertion_tree.qza \
--output-path exports
Here you’ll need to also input your metadata table, which at a minimum just needs a #SampleID column where the sample names given match the first part of the fasta files that you input for each sample. The other columns can be named whatever you like (although there are sometimes issues with certain characters so it’s easier to have all letters/numbers), and you can have as many columns/groups as you like.
qiime taxa barplot \
--i-table merged_table_final.qza \
--i-taxonomy taxa/classification.qza \
--m-metadata-file metadata.txt \
--o-visualization taxa/taxa_barplot.qzv
This will give you a range of PCoA plots etc. I suggest using the same sampling depth as given previously for filtering/rarefying samples. It will automatically rarefy all samples to this depth.
qiime diversity core-metrics-phylogenetic \
--i-table merged_table_final.qza \
--i-phylogeny insertion_tree.qza \
--p-sampling-depth 5000 \
--m-metadata-file metadata.txt \
--p-n-jobs 4 \
--output-dir diversity
I have mainly used Python for analysis in the past because this is what I already knew and was comfortable with before analysing any microbiome data, but there are many more packages and tutorials available for microbiome analysis in R (for example, unifrac can’t be calculated in Python). I now often use a combination of R and Python because this is relatively easy to do in R notebooks, but here I’ll give a typical workflow of what I would usually do in Python and then also doing most of this in only R. This website looks like it has a nice tutorial for analysis in R, as well as links to other resources. This website allows you to upload your data and do analyses this way, which can be great for getting results quickly, but obviously won’t allow you to customise analysis very easily.
Only do this once (remove empty rows from file):
# One-off cleanup: read the biom-exported feature table (the real column
# header is on the third line, hence header=2), discard any rows that
# contain missing values, and write the result back out as a plain CSV.
ft = pd.read_csv(folder+'exports/feature-table_w_tax.txt', sep='\t', header=2, index_col=0)
ft = ft.dropna()
ft.to_csv(folder+'/exports/feature_table.csv')
Now read in the data and I usually first sort the taxonomy so that I can easily get the name of an ASV at whatever phylogenetic level I want later on:
# Read the cleaned feature table and sample metadata, then split each ASV's
# ';'-separated taxonomy string into one column per phylogenetic level so
# that an ASV's name at any level can be looked up later.
ft = pd.read_csv(folder+'exports/feature_table.csv', header=0, index_col=0)
md = pd.read_csv(folder+'exports/metadata.txt', sep='\t', header=0, index_col=0)
tax_dict = {}  # ASV ID -> list of taxonomy strings, one per classified level
tax_mat = ft.loc[:, ['taxonomy']]
levels = ["Domain", "Phylum", "Class", "Order", "Family", "Genus", "Species"]
for level in levels:
    tax_mat[level] = ''
for asv in ft.index.values:
    tax = ft.loc[asv, 'taxonomy']
    tax = tax.split(';')
    for t in range(len(tax)):
        # strip the leading space that follows each ';' in the QIIME string
        tax[t] = tax[t].lstrip()
        # NOTE(review): assumes at most 7 ';'-separated fields; a longer
        # taxonomy string would raise IndexError on levels[t] -- confirm
        tax_mat.loc[asv, levels[t]] = tax[t]
    tax_dict[asv] = tax
# Drop the raw taxonomy column from both tables now that it is parsed
ft.drop(['taxonomy'], axis=1, inplace=True)
tax_mat.drop(['taxonomy'], axis=1, inplace=True)
tax_mat = tax_mat.reset_index()
tax_mat = tax_mat.rename(columns={'#OTU ID':'OTUID'})
# Save the counts-only table, then re-read it (fresh numeric dtypes)
ft.to_csv(folder+'exports/feature_table_no_tax.csv')
ft = pd.read_csv(folder+'exports/feature_table_no_tax.csv', header=0, index_col=0)
# Bar chart of the total number of reads per sample, sorted by sample name
fig = plt.figure(figsize=(9,6))
ax1 = plt.subplot(211)
ft_sum = pd.DataFrame(ft.sum(axis=0))  # total reads per sample (reused below)
labels = sorted(ft_sum.index.values)
ft_sum = ft_sum.loc[labels, :]
ft_sum.plot.bar(ax=ax1, legend=False)
plt.ylabel('Number of reads')
plt.show()
ft = pd.read_csv(folder+'exports/feature_table_no_tax.csv', header=0, index_col=0)
def boxplot(ax, ft_plot, md_var, ylab='', title=''):
    """
    Draw a boxplot of per-sample values grouped by one metadata column.

    ax      -- axis to draw on
    ft_plot -- one-column DataFrame of per-sample values (samples as index)
    md_var  -- one-column DataFrame mapping each sample to its group value
    ylab    -- y-axis label (only drawn on the left-most axis)
    title   -- plot title (the metadata column name)

    NOTE(review): compares ax against the global ax1 to decide whether to
    show the y label -- so ax1 must exist in the calling scope before this
    function is called.
    """
    # Re-index samples by their group value so pandas groups the boxes
    rename = {}
    for sample in md_var.index.values:
        rename[sample] = md_var.loc[sample, :].values[0]
    ft_plot = ft_plot.rename(index=rename).rename(columns={0:'Sum'}).reset_index()
    ft_plot.boxplot(ax=ax, by='index', column='Sum', grid=False)
    plt.sca(ax)
    if ax != ax1: plt.yticks([])
    else: plt.ylabel(ylab)
    plt.xticks(rotation=90)
    plt.xlabel('')
    plt.title(title)
    return
# One reads-per-sample boxplot for each of the first three metadata columns
fig = plt.figure(figsize=(9,6))
ax1, ax2, ax3 = plt.subplot(231), plt.subplot(232), plt.subplot(233)
axes = [ax1, ax2, ax3]
for a in range(3):
    group = md.columns[a]
    boxplot(axes[a], ft_sum, pd.DataFrame(md.loc[:, group]), 'Number of reads', group)
fig.suptitle('')
plt.show()
def get_diversity(diversity, sample):
    '''
    Calculate a diversity metric for a single sample.

    Parameters:
    - diversity: the name of the metric; one of 'Simpsons', 'Shannon',
      'Richness', 'Evenness' or 'Maximum' ('Maximum' is not a diversity
      metric -- it returns the largest abundance in the sample as a
      percentage of the sample total)
    - sample: a sequence of abundance values for one sample

    Returns:
    - The diversity index for the sample, or None if `diversity` is not a
      recognised metric name (matching the original behaviour).
    '''
    # Work on a float copy so the caller's list/array is never modified
    # (the original implementation overwrote `sample` in place).
    counts = np.asarray(sample, dtype=float)
    total = counts.sum()
    if diversity == 'Simpsons':
        p = counts / total
        return float(1 - np.sum(p ** 2))
    elif diversity == 'Shannon':
        p = counts / total
        nz = p[p != 0]  # zero abundances contribute nothing to H'
        return float(np.sum(-nz * np.log(nz)))
    elif diversity == 'Richness':
        # Number of taxa with non-zero abundance
        return int(np.count_nonzero(counts))
    elif diversity == 'Evenness':
        p = counts / total
        nz = p[p != 0]
        shannon = np.sum(-nz * np.log(nz))
        rich = np.count_nonzero(counts)
        # NOTE(review): a sample with a single taxon gives log(1) == 0 and a
        # divide-by-zero here, exactly as in the original implementation.
        return float(shannon / np.log(rich))
    elif diversity == 'Maximum':
        return float((counts.max() / total) * 100)
    return None
# --- Shannon diversity: per-sample bar chart ---
ft = pd.read_csv(folder+'exports/feature_table_no_tax.csv', header=0, index_col=0)
fig = plt.figure(figsize=(9,6))
ax1 = plt.subplot(211)
shannon = [get_diversity('Shannon', ft.loc[:, sample].values) for sample in ft.columns]
ft_shannon = pd.DataFrame([shannon], columns=ft.columns).transpose()
labels = sorted(ft_shannon.index.values)
ft_shannon = ft_shannon.loc[labels, :]
ft_shannon.plot.bar(ax=ax1, legend=False)
plt.ylabel('Shannon diversity')
plt.show()
# --- Shannon diversity: boxplots grouped by the first three metadata columns ---
ft = pd.read_csv(folder+'exports/feature_table_no_tax.csv', header=0, index_col=0)
fig = plt.figure(figsize=(9,6))
ax1, ax2, ax3 = plt.subplot(231), plt.subplot(232), plt.subplot(233)
axes = [ax1, ax2, ax3]
for a in range(3):
    group = md.columns[a]
    boxplot(axes[a], ft_shannon, pd.DataFrame(md.loc[:, group]), 'Shannon diversity', group)
fig.suptitle('')
plt.show()
# --- Simpsons diversity: per-sample bar chart ---
ft = pd.read_csv(folder+'exports/feature_table_no_tax.csv', header=0, index_col=0)
fig = plt.figure(figsize=(9,6))
ax1 = plt.subplot(211)
simpsons = [get_diversity('Simpsons', ft.loc[:, sample].values) for sample in ft.columns]
ft_simpsons = pd.DataFrame([simpsons], columns=ft.columns).transpose()
labels = sorted(ft_simpsons.index.values)
ft_simpsons = ft_simpsons.loc[labels, :]
ft_simpsons.plot.bar(ax=ax1, legend=False)
plt.ylabel('Simpsons diversity')
plt.show()
# --- Simpsons diversity: grouped boxplots ---
ft = pd.read_csv(folder+'exports/feature_table_no_tax.csv', header=0, index_col=0)
fig = plt.figure(figsize=(9,6))
ax1, ax2, ax3 = plt.subplot(231), plt.subplot(232), plt.subplot(233)
axes = [ax1, ax2, ax3]
for a in range(3):
    group = md.columns[a]
    boxplot(axes[a], ft_simpsons, pd.DataFrame(md.loc[:, group]), 'Simpsons diversity', group)
fig.suptitle('')
plt.show()
# --- Richness (observed ASVs): per-sample bar chart ---
ft = pd.read_csv(folder+'exports/feature_table_no_tax.csv', header=0, index_col=0)
fig = plt.figure(figsize=(9,6))
ax1 = plt.subplot(211)
richness = [get_diversity('Richness', ft.loc[:, sample].values) for sample in ft.columns]
ft_richness = pd.DataFrame([richness], columns=ft.columns).transpose()
labels = sorted(ft_richness.index.values)
ft_richness = ft_richness.loc[labels, :]
ft_richness.plot.bar(ax=ax1, legend=False)
plt.ylabel('Richness')
plt.show()
# --- Richness: grouped boxplots ---
ft = pd.read_csv(folder+'exports/feature_table_no_tax.csv', header=0, index_col=0)
fig = plt.figure(figsize=(9,6))
ax1, ax2, ax3 = plt.subplot(231), plt.subplot(232), plt.subplot(233)
axes = [ax1, ax2, ax3]
for a in range(3):
    group = md.columns[a]
    boxplot(axes[a], ft_richness, pd.DataFrame(md.loc[:, group]), 'Richness', group)
fig.suptitle('')
plt.show()
I’ll now normalise the data in a variety of ways: - convert to relative abundance - rarefy - convert to CLR These bits are the first parts I’ll do some calculations in R, and for the rarefied and relative abundance data I’ll calculate unifrac distances, but for the CLR data I’ll calculate PHILR distance. As I don’t have a tree, I’m just making this now in QIIME2.
Get relative abundance:
# Convert counts to relative abundance (% of each sample's column total)
ft = pd.read_csv(folder+'exports/feature_table_no_tax.csv', header=0, index_col=0)
ft_ra = ft.divide(ft.sum(axis=0), axis=1).multiply(100)
Convert the tree:
qiime tools export \
--input-path all_files/asvs-tree.qza \
--output-path exports
CLR & PHILR:
#CLR
# Pull the raw ASV count table from the Python session via reticulate
asv_table <- py$ft
asv_table = as.matrix(asv_table)
#Convert these to phyloseq objects
ASV = otu_table(asv_table, taxa_are_rows = TRUE)
physeq = phyloseq(ASV)
# Centred log-ratio transform of the counts
physeq_clr <- microbiome::transform(physeq, "clr")
physeq.clr.df <- as.data.frame(as(otu_table(physeq_clr), "matrix"))
# write_phyloseq always writes "otu_table.csv" -- rename it immediately so
# the rarefied table written later does not overwrite it
write_phyloseq(physeq_clr, type="all", path=folder)
file.rename(paste(folder, "otu_table.csv", sep=""), paste(folder, "otu_table_clr.csv", sep=""))
#PHILR
# Taxonomy table from Python: column 1 holds the ASV IDs, used as rownames
taxmat <- py$tax_mat
taxmat2 <- taxmat[,-1]
rownames(taxmat2) <- taxmat[,1]
sampledata <- sample_data(py$md)
colnames(taxmat2) <- c("Domain", "Phylum", "Class", "Order", "Family", "Genus", "Species")
phy_tree <- read_tree(paste(folder, "exports/tree.nwk", sep=''))
#Convert these to phyloseq objects
ASV = otu_table(asv_table, taxa_are_rows = TRUE)
TAX = tax_table(taxmat2)
taxa_names(TAX) <- taxmat[,1]
physeq = phyloseq(ASV,phy_tree,TAX, sampledata)
# philr needs a rooted, strictly binary tree -- check both before continuing
is.rooted(phy_tree(physeq))
# NOTE(review): is.binary.tree() is deprecated in newer ape releases in
# favour of is.binary() -- confirm against the installed ape version
is.binary.tree(phy_tree(physeq))
# Number the internal nodes (n1, n2, ...) so balances can be referred to by name
phy_tree(physeq) <- makeNodeLabel(phy_tree(physeq), method="number", prefix='n')
name.balance(phy_tree(physeq), tax_table(physeq), 'n1')
#Add pseudocount
# (+1 to every count so the log-ratios below are defined for zero counts)
physeq <- transform_sample_counts(physeq, function(x) x+1)
#now the philr part
otu.table <- t(otu_table(physeq))   # transpose: samples as rows for philr
tree <- phy_tree(physeq)
metadata <- sample_data(physeq)
tax <- tax_table(physeq)
physeq.philr <- philr(otu.table, tree, part.weights='enorm.x.gm.counts', ilr.weights='blw.sqrt')
physeq.philr.mat = as.matrix(physeq.philr)
#now calculate the distance
# Euclidean distance in PHILR space, saved as CSV for ordination in Python
physeq.dist <- dist(physeq.philr, method="euclidean")
physeq.dist.mat <- as.matrix(physeq.dist)
write.csv(physeq.dist.mat, paste(folder, "philr_distance.csv", sep=""))
Rarefy:
# Rarefy: subsample every sample down to the smallest library size
asv_table <- py$ft
asv_table = as.matrix(asv_table)
phy_tree <- read_tree(paste(folder, "exports/tree.nwk", sep=''))
ASV = otu_table(asv_table, taxa_are_rows = TRUE)
physeq = phyloseq(ASV, phy_tree)
# NOTE(review): rngseed = FALSE means no seed is set, so this subsampling is
# not reproducible between runs -- pass an integer seed for determinism
physeq_rare <- rarefy_even_depth(physeq, sample.size = min(sample_sums(physeq)), rngseed = FALSE, replace = TRUE, trimOTUs = TRUE, verbose = TRUE)
physeq.rare.df <- as.data.frame(as(otu_table(physeq_rare), "matrix"))
# write_phyloseq writes "otu_table.csv"; rename so the rarefied copy is kept
write_phyloseq(physeq_rare, type="all", path=folder)
file.rename(paste(folder, "otu_table.csv", sep=""), paste(folder, "otu_table_rare.csv", sep=""))
Unifrac:
# Weighted and unweighted UniFrac distance matrices for each normalisation
# (rarefied / raw counts / relative abundance), each written to CSV so the
# ordination can be done in Python.
#rarefied
w_uf_rare <- UniFrac(physeq_rare, weighted=TRUE, normalized=FALSE, fast=TRUE)
w_uf_df_rare <- as.data.frame(as.matrix(w_uf_rare))
write.csv(w_uf_df_rare, paste(folder, "/weighted_unifrac_rare.csv", sep=""))
uw_uf_rare <- UniFrac(physeq_rare, weighted=FALSE, normalized=FALSE, fast=TRUE)
uw_uf_df_rare <- as.data.frame(as.matrix(uw_uf_rare))
write.csv(uw_uf_df_rare, paste(folder, "/unweighted_unifrac_rare.csv", sep=""))
#not normalised
# (absolute counts; physeq still holds the un-rarefied table + tree)
w_uf_nn <- UniFrac(physeq, weighted=TRUE, normalized=FALSE, fast=TRUE)
w_uf_df_nn <- as.data.frame(as.matrix(w_uf_nn))
write.csv(w_uf_df_nn, paste(folder, "/weighted_unifrac_abs.csv", sep=""))
uw_uf_nn <- UniFrac(physeq, weighted=FALSE, normalized=FALSE, fast=TRUE)
uw_uf_df_nn <- as.data.frame(as.matrix(uw_uf_nn))
write.csv(uw_uf_df_nn, paste(folder, "/unweighted_unifrac_abs.csv", sep=""))
#relative abundance
# (ft_ra computed in the Python chunk above)
asv_table <- py$ft_ra
asv_table = as.matrix(asv_table)
ASV = otu_table(asv_table, taxa_are_rows = TRUE)
physeq_ra = phyloseq(ASV,phy_tree)
w_uf_ra <- UniFrac(physeq_ra, weighted=TRUE, normalized=FALSE, fast=TRUE)
w_uf_df_ra <- as.data.frame(as.matrix(w_uf_ra))
write.csv(w_uf_df_ra, paste(folder, "/weighted_unifrac_ra.csv", sep=""))
uw_uf_ra <- UniFrac(physeq_ra, weighted=FALSE, normalized=FALSE, fast=TRUE)
uw_uf_df_ra <- as.data.frame(as.matrix(uw_uf_ra))
write.csv(uw_uf_df_ra, paste(folder, "/unweighted_unifrac_ra.csv", sep=""))
For some reason R often saves .csv files in a weird format that Python can’t open, if this is a problem, just open them and then save as .csv - this will fix it.
For all of these plots the stress value is above 0.2, which indicates that 2 dimensions probably isn’t enough to represent all of the diversity present.
def transform_for_NMDS(df, dist_met='braycurtis'):
    """
    Run metric MDS followed by non-metric MDS (NMDS) on a sample table or a
    pre-computed distance matrix.

    Parameters:
    - df: pandas DataFrame with samples as rows. If dist_met is a scipy
      metric name, pairwise distances between rows are computed; if
      dist_met is False, df is assumed to already be a square distance
      matrix.
    - dist_met: scipy.spatial.distance metric name, or False.

    Returns:
    - (pos, npos, stress): metric-MDS coordinates, NMDS coordinates and the
      final NMDS stress value.
    """
    # (The original also built an unused `y` from the first column; removed.)
    X_true = df.values
    seed = np.random.RandomState(seed=3)  # fixed seed -> reproducible layout
    if dist_met != False:
        similarities = distance.cdist(X_true, X_true, dist_met)
    else:
        similarities = df
        X_true = similarities.values
    similarities = np.nan_to_num(similarities)
    # Metric MDS first; its embedding is used to initialise the NMDS fit
    mds = manifold.MDS(n_components=2, max_iter=3000, eps=1e-9, random_state=seed,
                       dissimilarity="precomputed", n_jobs=1)
    pos = mds.fit(similarities).embedding_
    nmds = manifold.MDS(n_components=2, metric=False, max_iter=3000, eps=1e-12,
                        dissimilarity="precomputed", random_state=seed, n_jobs=1,
                        n_init=1)
    npos = nmds.fit_transform(similarities, init=pos)
    # Rescale both embeddings to the norm of the input data
    pos *= np.sqrt((X_true ** 2).sum()) / np.sqrt((pos ** 2).sum())
    npos *= np.sqrt((X_true ** 2).sum()) / np.sqrt((npos ** 2).sum())
    # Rotate via PCA so the first axis carries the most variance
    clf = PCA()
    X_true = clf.fit_transform(X_true)
    pos = clf.fit_transform(pos)
    npos = clf.fit_transform(npos)
    return pos, npos, nmds.stress_
# Colour per Treatment group and marker shape per Site, shared by all
# ordination plots below
color_group = {'Control':'#1A5276', 'Treatment':'#F1C40F'}
shapes = {'Site1':'v', 'Site2':'o', 'Site3':'^', 'Site4':'s', 'Site5':'P'}
ms = 10
# Legend handles: coloured patches for treatments, black markers for sites
handles_color = [Patch(facecolor=color_group[color], edgecolor='k', label=color) for color in color_group]
handles_shape = [Line2D([0], [0], marker=shapes[shape], color='w', label=shape[:4]+' '+shape[-1], markerfacecolor='k', markersize=ms) for shape in shapes]
# --- NMDS of the raw counts (default Bray-Curtis distance) ---
ft = pd.read_csv(folder+'exports/feature_table_no_tax.csv', header=0, index_col=0)
pos, npos, stress = transform_for_NMDS(ft.transpose())
fig = plt.figure(figsize=(10,10))
ax1 = plt.subplot(221)
for a in range(len(npos)):
    sample = ft.columns[a]
    color = color_group[md.loc[sample, 'Treatment']]
    shape = shapes[md.loc[sample, 'Location']]
    ax1.scatter(npos[a,0], npos[a,1], color=color, marker=shape, edgecolor='k')
plt.xlabel('NMDS1'), plt.ylabel('NMDS2')
plt.text(0.05, 0.95, 'Stress = '+str(round(stress, 3)), transform=ax1.transAxes)
leg1 = ax1.legend(handles=handles_color, title='Treatment', loc='upper left', bbox_to_anchor=(1.05, 1.02))
ax1.legend(handles=handles_shape, title='Site', loc='upper left', bbox_to_anchor=(1.05, 0.84))
ax1.add_artist(leg1)
plt.tight_layout()
plt.show()
# --- NMDS of relative abundances ---
pos, npos, stress = transform_for_NMDS(ft_ra.transpose())
fig = plt.figure(figsize=(10,10))
ax1 = plt.subplot(221)
for a in range(len(npos)):
    sample = ft_ra.columns[a]
    color = color_group[md.loc[sample, 'Treatment']]
    shape = shapes[md.loc[sample, 'Location']]
    ax1.scatter(npos[a,0], npos[a,1], color=color, marker=shape, edgecolor='k')
plt.xlabel('NMDS1'), plt.ylabel('NMDS2')
plt.text(0.05, 0.95, 'Stress = '+str(round(stress, 3)), transform=ax1.transAxes)
leg1 = ax1.legend(handles=handles_color, title='Treatment', loc='upper left', bbox_to_anchor=(1.05, 1.02))
ax1.legend(handles=handles_shape, title='Site', loc='upper left', bbox_to_anchor=(1.05, 0.84))
ax1.add_artist(leg1)
plt.tight_layout()
plt.show()
# --- NMDS of the rarefied table written out by the R chunk above ---
ft_rare = pd.read_csv(folder+'otu_table_rare.csv', header=0, index_col=0)
pos, npos, stress = transform_for_NMDS(ft_rare.transpose())
fig = plt.figure(figsize=(10,10))
ax1 = plt.subplot(221)
for a in range(len(npos)):
    sample = ft_rare.columns[a]
    color = color_group[md.loc[sample, 'Treatment']]
    shape = shapes[md.loc[sample, 'Location']]
    ax1.scatter(npos[a,0], npos[a,1], color=color, marker=shape, edgecolor='k')
plt.xlabel('NMDS1'), plt.ylabel('NMDS2')
plt.text(0.05, 0.95, 'Stress = '+str(round(stress, 3)), transform=ax1.transAxes)
leg1 = ax1.legend(handles=handles_color, title='Treatment', loc='upper left', bbox_to_anchor=(1.05, 1.02))
ax1.legend(handles=handles_shape, title='Site', loc='upper left', bbox_to_anchor=(1.05, 0.84))
ax1.add_artist(leg1)
plt.tight_layout()
plt.show()
# --- NMDS from pre-computed UniFrac matrices (absolute counts) ---
w_uf = pd.read_csv(folder+'weighted_unifrac_abs.csv', index_col=0, header=0)
uw_uf = pd.read_csv(folder+'unweighted_unifrac_abs.csv', index_col=0, header=0)
pos_w, npos_w, stress_w = transform_for_NMDS(w_uf, dist_met=False)
pos_uw, npos_uw, stress_uw = transform_for_NMDS(uw_uf, dist_met=False)
fig = plt.figure(figsize=(10,10))
ax1 = plt.subplot(221)
ax2 = plt.subplot(222)
for a in range(len(npos_w)):
    sample = w_uf.columns[a]
    color = color_group[md.loc[sample, 'Treatment']]
    shape = shapes[md.loc[sample, 'Location']]
    ax1.scatter(npos_w[a,0], npos_w[a,1], color=color, marker=shape, edgecolor='k')
    ax2.scatter(npos_uw[a,0], npos_uw[a,1], color=color, marker=shape, edgecolor='k')
plt.sca(ax1)
plt.xlabel('NMDS1'), plt.ylabel('NMDS2'), plt.title('Weighted Unifrac')
plt.text(0.05, 0.95, 'Stress = '+str(round(stress_w, 3)), transform=ax1.transAxes)
plt.sca(ax2)
plt.xlabel('NMDS1'), plt.title('Unweighted Unifrac')
plt.text(0.05, 0.95, 'Stress = '+str(round(stress_uw, 3)), transform=ax2.transAxes)
leg1 = ax2.legend(handles=handles_color, title='Treatment', loc='upper left', bbox_to_anchor=(1.05, 1.02))
ax2.legend(handles=handles_shape, title='Site', loc='upper left', bbox_to_anchor=(1.05, 0.84))
ax2.add_artist(leg1)
plt.tight_layout()
plt.show()
# --- NMDS from UniFrac matrices (relative abundance) ---
w_uf = pd.read_csv(folder+'weighted_unifrac_ra.csv', index_col=0, header=0)
uw_uf = pd.read_csv(folder+'unweighted_unifrac_ra.csv', index_col=0, header=0)
pos_w, npos_w, stress_w = transform_for_NMDS(w_uf, dist_met=False)
pos_uw, npos_uw, stress_uw = transform_for_NMDS(uw_uf, dist_met=False)
fig = plt.figure(figsize=(10,10))
ax1 = plt.subplot(221)
ax2 = plt.subplot(222)
for a in range(len(npos_w)):
    sample = w_uf.columns[a]
    color = color_group[md.loc[sample, 'Treatment']]
    shape = shapes[md.loc[sample, 'Location']]
    ax1.scatter(npos_w[a,0], npos_w[a,1], color=color, marker=shape, edgecolor='k')
    ax2.scatter(npos_uw[a,0], npos_uw[a,1], color=color, marker=shape, edgecolor='k')
plt.sca(ax1)
plt.xlabel('NMDS1'), plt.ylabel('NMDS2'), plt.title('Weighted Unifrac')
plt.text(0.05, 0.95, 'Stress = '+str(round(stress_w, 3)), transform=ax1.transAxes)
plt.sca(ax2)
plt.xlabel('NMDS1'), plt.title('Unweighted Unifrac')
plt.text(0.05, 0.95, 'Stress = '+str(round(stress_uw, 3)), transform=ax2.transAxes)
leg1 = ax2.legend(handles=handles_color, title='Treatment', loc='upper left', bbox_to_anchor=(1.05, 1.02))
ax2.legend(handles=handles_shape, title='Site', loc='upper left', bbox_to_anchor=(1.05, 0.84))
ax2.add_artist(leg1)
plt.tight_layout()
plt.show()
# --- NMDS from UniFrac matrices (rarefied) ---
w_uf = pd.read_csv(folder+'weighted_unifrac_rare.csv', index_col=0, header=0)
uw_uf = pd.read_csv(folder+'unweighted_unifrac_rare.csv', index_col=0, header=0)
pos_w, npos_w, stress_w = transform_for_NMDS(w_uf, dist_met=False)
pos_uw, npos_uw, stress_uw = transform_for_NMDS(uw_uf, dist_met=False)
fig = plt.figure(figsize=(10,10))
ax1 = plt.subplot(221)
ax2 = plt.subplot(222)
for a in range(len(npos_w)):
    sample = w_uf.columns[a]
    color = color_group[md.loc[sample, 'Treatment']]
    shape = shapes[md.loc[sample, 'Location']]
    ax1.scatter(npos_w[a,0], npos_w[a,1], color=color, marker=shape, edgecolor='k')
    ax2.scatter(npos_uw[a,0], npos_uw[a,1], color=color, marker=shape, edgecolor='k')
plt.sca(ax1)
plt.xlabel('NMDS1'), plt.ylabel('NMDS2'), plt.title('Weighted Unifrac')
plt.text(0.05, 0.95, 'Stress = '+str(round(stress_w, 3)), transform=ax1.transAxes)
plt.sca(ax2)
plt.xlabel('NMDS1'), plt.title('Unweighted Unifrac')
plt.text(0.05, 0.95, 'Stress = '+str(round(stress_uw, 3)), transform=ax2.transAxes)
leg1 = ax2.legend(handles=handles_color, title='Treatment', loc='upper left', bbox_to_anchor=(1.05, 1.02))
ax2.legend(handles=handles_shape, title='Site', loc='upper left', bbox_to_anchor=(1.05, 0.84))
ax2.add_artist(leg1)
plt.tight_layout()
plt.show()
# --- NMDS of the CLR-transformed table (Euclidean = Aitchison distance) ---
ft_clr = pd.read_csv(folder+'otu_table_clr.csv', header=0, index_col=0)
pos, npos, stress = transform_for_NMDS(ft_clr.transpose(), dist_met='euclidean')
fig = plt.figure(figsize=(10,10))
ax1 = plt.subplot(221)
for a in range(len(npos)):
    # Fixed: index the columns of the table that was actually ordinated
    # (ft_clr) rather than ft -- the R-written CSV may order samples
    # differently, which would silently mislabel every point.
    sample = ft_clr.columns[a]
    color = color_group[md.loc[sample, 'Treatment']]
    shape = shapes[md.loc[sample, 'Location']]
    ax1.scatter(npos[a,0], npos[a,1], color=color, marker=shape, edgecolor='k')
plt.xlabel('NMDS1'), plt.ylabel('NMDS2')
plt.text(0.05, 0.95, 'Stress = '+str(round(stress, 3)), transform=ax1.transAxes)
leg1 = ax1.legend(handles=handles_color, title='Treatment', loc='upper left', bbox_to_anchor=(1.05, 1.02))
ax1.legend(handles=handles_shape, title='Site', loc='upper left', bbox_to_anchor=(1.05, 0.84))
ax1.add_artist(leg1)
plt.tight_layout()
plt.show()
This is based on log ratios but still accounts for phylogeny.
# --- NMDS from the pre-computed PHILR (Euclidean) distance matrix ---
philr = pd.read_csv(folder+'philr_distance.csv', index_col=0, header=0)
pos, npos, stress = transform_for_NMDS(philr, dist_met=False)
fig = plt.figure(figsize=(10,10))
ax1 = plt.subplot(221)
for a in range(len(npos)):
    sample = philr.columns[a]
    color = color_group[md.loc[sample, 'Treatment']]
    shape = shapes[md.loc[sample, 'Location']]
    ax1.scatter(npos[a,0], npos[a,1], color=color, marker=shape, edgecolor='k')
plt.xlabel('NMDS1'), plt.ylabel('NMDS2')
plt.text(0.05, 0.95, 'Stress = '+str(round(stress, 3)), transform=ax1.transAxes)
leg1 = ax1.legend(handles=handles_color, title='Treatment', loc='upper left', bbox_to_anchor=(1.05, 1.02))
ax1.legend(handles=handles_shape, title='Site', loc='upper left', bbox_to_anchor=(1.05, 0.84))
ax1.add_artist(leg1)
plt.tight_layout()
plt.show()
Here we’re just plotting the relative abundance of the (not rarefied) tables. We’ll plot the top 40 taxa by mean relative abundance.
# Build qualitative palettes for the taxa bar plots: 20 colours from tab20
# and 40 colours from tab20b + tab20c
ft = pd.read_csv(folder+'exports/feature_table_no_tax.csv', header=0, index_col=0)
colormap_20, colormap_40b, colormap_40c = mpl.cm.get_cmap('tab20', 256), mpl.cm.get_cmap('tab20b', 256), mpl.cm.get_cmap('tab20c', 256)
norm, norm2 = mpl.colors.Normalize(vmin=0, vmax=19), mpl.colors.Normalize(vmin=20, vmax=39)
m1, m2, m3 = mpl.cm.ScalarMappable(norm=norm, cmap=colormap_20), mpl.cm.ScalarMappable(norm=norm, cmap=colormap_40b), mpl.cm.ScalarMappable(norm=norm2, cmap=colormap_40c)
colors_20 = [m1.to_rgba(a) for a in range(20)]
colors_40 = [m2.to_rgba(a) for a in range(20)]+[m3.to_rgba(a) for a in range(20,40)]
def plot_bar(ax, ax_sml, data, rename_dict, level, legend=True):
    """Stacked relative-abundance bar plot plus a metadata strip.

    ax: main axes for the stacked bars (one bar per sample, top 40 taxa).
    ax_sml: small axes below, coloured by Treatment and labelled by site.
    data: taxa x samples table of relative abundances (percent).
    rename_dict: ASV -> taxonomy list; taxa are collapsed to `level`.
    level: index into each taxonomy list (0 = Domain, ...).
    legend: draw the taxa and Treatment legends beside the plot.

    Relies on notebook globals colors_20/colors_40, color_group, md and
    handles_color.
    """
    plt.sca(ax)
    # Collapse ASVs to the requested taxonomic level; ASVs without a
    # classification that deep keep their deepest available label.
    rename_level = {}
    for asv in rename_dict:
        tax = rename_dict[asv]
        if len(tax) < (level+1):
            rename_level[asv] = tax[-1]
        else:
            rename_level[asv] = tax[level]
    data = data.rename(index=rename_level)
    data = data.groupby(by=data.index).sum()
    # Keep the 40 most abundant taxa by mean relative abundance.
    data['Mean'] = data.mean(axis=1)
    data = data.sort_values('Mean', ascending=False)
    data = data.iloc[:40, :]
    data = data.drop('Mean', axis=1).transpose()
    # Palette: 20-colour list for few taxa, 40-colour list otherwise; the
    # extra grey is for the 'Other' (remainder) category appended below.
    if data.shape[1] < 20:
        colors = colors_20[:data.shape[1]]+['#566573']
        cols = 1
    else:
        colors = colors_40[:data.shape[1]]+['#566573']
        cols = 2
    data['Other'] = 100-data.sum(axis=1)
    if legend:
        data.plot.bar(stacked=True, ax=ax, color=colors, edgecolor='k', width=0.8)
        leg = ax.legend(loc='upper left', bbox_to_anchor=(1.01, 0.9), fontsize=8, ncol=cols)
    else:
        data.plot.bar(stacked=True, ax=ax, legend=False, color=colors, edgecolor='k', width=0.8)
    plt.ylabel('Relative abundance(%)')
    plt.xticks([])
    plt.xlim([-0.5, data.shape[0]-0.5])
    plt.ylim([0,100])
    # Metadata strip below the bars.  BUG FIX: this previously activated the
    # *global* sml_ax instead of the ax_sml parameter, so the function only
    # worked when callers happened to pass that global.
    plt.sca(ax_sml)
    col_md = []
    x, y = [a for a in range(len(data.index.values))], [1 for a in range(len(data.index.values))]
    count = 0
    for sample in data.index.values:
        col_md.append(color_group[md.loc[sample, 'Treatment']])
        # Last character of the Location string is the site number.
        ax_sml.text(count, 0.5, md.loc[sample, 'Location'][-1], va='center', ha='center')
        count += 1
    ax_sml.bar(x, y, color=col_md, edgecolor='k', width=1)
    if legend:
        # Second legend (Treatment colours); re-add the taxa legend so both show.
        ax.legend(handles=handles_color, loc='upper left', bbox_to_anchor=(1.01, 1.05))
        ax.add_artist(leg)
    plt.xticks(x, data.index.values, rotation=90)
    plt.xlim([-0.5, data.shape[0]-0.5])
    plt.yticks([])
    return
# Convert counts to per-sample relative abundance (%), fix an explicit sample
# order, and plot the Domain level (level index 0).
ft_ra = ft.divide(ft.sum(axis=0), axis=1).multiply(100)
labels = ['2ndBatchDNA4', '2ndBatchDNA5', '2ndBatchDNA6', '2ndBatchDNA1', '2ndBatchDNA2', '2ndBatchDNA3', '2ndBatchDNA7', '2ndBatchDNA8', '2ndBatchDNA9', 'DNA7', 'DNA9', 'DNA10', 'DNA11', 'DNA12', 'DNA13', 'DNA14', 'DNA15', 'DNA16', 'DNA17', 'DNA18']
ft = ft.loc[:, labels]
ft_ra = ft_ra.loc[:, labels]
plt.figure(figsize=(14,10))
ax1 = plt.subplot2grid((16,10), (0, 0), rowspan=8, colspan=6)
sml_ax = plt.subplot2grid((24,10), (12, 0), colspan=6)
plot_bar(ax1, sml_ax, ft_ra, tax_dict, 0)
plt.subplots_adjust(hspace=0)
plt.tight_layout()
plt.show()
# Same bar plot repeated for levels 1-6 (Phylum..Species).  NOTE(review):
# recomputing ft_ra and re-subsetting ft in each cell is redundant after the
# first cell but harmless, since ft is already restricted to `labels`.
# Phylum (level 1):
ft_ra = ft.divide(ft.sum(axis=0), axis=1).multiply(100)
ft = ft.loc[:, labels]
ft_ra = ft_ra.loc[:, labels]
plt.figure(figsize=(14,10))
ax1 = plt.subplot2grid((16,10), (0, 0), rowspan=8, colspan=6)
sml_ax = plt.subplot2grid((24,10), (12, 0), colspan=6)
plot_bar(ax1, sml_ax, ft_ra, tax_dict, 1)
plt.subplots_adjust(hspace=0)
plt.tight_layout()
plt.show()
# Class (level 2):
ft_ra = ft.divide(ft.sum(axis=0), axis=1).multiply(100)
ft = ft.loc[:, labels]
ft_ra = ft_ra.loc[:, labels]
plt.figure(figsize=(14,10))
ax1 = plt.subplot2grid((16,10), (0, 0), rowspan=8, colspan=6)
sml_ax = plt.subplot2grid((24,10), (12, 0), colspan=6)
plot_bar(ax1, sml_ax, ft_ra, tax_dict, 2)
plt.subplots_adjust(hspace=0)
plt.tight_layout()
plt.show()
# Order (level 3):
ft_ra = ft.divide(ft.sum(axis=0), axis=1).multiply(100)
ft = ft.loc[:, labels]
ft_ra = ft_ra.loc[:, labels]
plt.figure(figsize=(14,10))
ax1 = plt.subplot2grid((16,10), (0, 0), rowspan=8, colspan=6)
sml_ax = plt.subplot2grid((24,10), (12, 0), colspan=6)
plot_bar(ax1, sml_ax, ft_ra, tax_dict, 3)
plt.subplots_adjust(hspace=0)
plt.tight_layout()
plt.show()
# Family (level 4):
ft_ra = ft.divide(ft.sum(axis=0), axis=1).multiply(100)
ft = ft.loc[:, labels]
ft_ra = ft_ra.loc[:, labels]
plt.figure(figsize=(14,10))
ax1 = plt.subplot2grid((16,10), (0, 0), rowspan=8, colspan=6)
sml_ax = plt.subplot2grid((24,10), (12, 0), colspan=6)
plot_bar(ax1, sml_ax, ft_ra, tax_dict, 4)
plt.subplots_adjust(hspace=0)
plt.tight_layout()
plt.show()
# Genus (level 5):
ft_ra = ft.divide(ft.sum(axis=0), axis=1).multiply(100)
ft = ft.loc[:, labels]
ft_ra = ft_ra.loc[:, labels]
plt.figure(figsize=(14,10))
ax1 = plt.subplot2grid((16,10), (0, 0), rowspan=8, colspan=6)
sml_ax = plt.subplot2grid((24,10), (12, 0), colspan=6)
plot_bar(ax1, sml_ax, ft_ra, tax_dict, 5)
plt.subplots_adjust(hspace=0)
plt.tight_layout()
plt.show()
# Species (level 6):
ft_ra = ft.divide(ft.sum(axis=0), axis=1).multiply(100)
ft = ft.loc[:, labels]
ft_ra = ft_ra.loc[:, labels]
plt.figure(figsize=(14,10))
ax1 = plt.subplot2grid((16,10), (0, 0), rowspan=8, colspan=6)
sml_ax = plt.subplot2grid((24,10), (12, 0), colspan=6, rowspan=1)
plot_bar(ax1, sml_ax, ft_ra, tax_dict, 6)
plt.subplots_adjust(hspace=0)
plt.tight_layout()
plt.show()
And this is the abundance of the rarefied tables.
# Same stacked bar plots on the rarefied table, levels 0-6.  Each cell
# reloads the rarefied table from disk before normalising to percent.
# Domain (level 0):
ft_rare = pd.read_csv(folder+'otu_table_rare.csv', header=0, index_col=0)
ft_ra = ft_rare.divide(ft_rare.sum(axis=0), axis=1).multiply(100)
ft_ra = ft_ra.loc[:, labels]
plt.figure(figsize=(14,10))
ax1 = plt.subplot2grid((16,10), (0, 0), rowspan=8, colspan=6)
sml_ax = plt.subplot2grid((24,10), (12, 0), colspan=6)
plot_bar(ax1, sml_ax, ft_ra, tax_dict, 0)
plt.subplots_adjust(hspace=0)
plt.tight_layout()
plt.show()
# Phylum (level 1):
ft_rare = pd.read_csv(folder+'otu_table_rare.csv', header=0, index_col=0)
ft_ra = ft_rare.divide(ft_rare.sum(axis=0), axis=1).multiply(100)
ft_ra = ft_ra.loc[:, labels]
plt.figure(figsize=(14,10))
ax1 = plt.subplot2grid((16,10), (0, 0), rowspan=8, colspan=6)
sml_ax = plt.subplot2grid((24,10), (12, 0), colspan=6)
plot_bar(ax1, sml_ax, ft_ra, tax_dict, 1)
plt.subplots_adjust(hspace=0)
plt.tight_layout()
plt.show()
# Class (level 2):
ft_rare = pd.read_csv(folder+'otu_table_rare.csv', header=0, index_col=0)
ft_ra = ft_rare.divide(ft_rare.sum(axis=0), axis=1).multiply(100)
ft_ra = ft_ra.loc[:, labels]
plt.figure(figsize=(14,10))
ax1 = plt.subplot2grid((16,10), (0, 0), rowspan=8, colspan=6)
sml_ax = plt.subplot2grid((24,10), (12, 0), colspan=6)
plot_bar(ax1, sml_ax, ft_ra, tax_dict, 2)
plt.subplots_adjust(hspace=0)
plt.tight_layout()
plt.show()
# Order (level 3):
ft_rare = pd.read_csv(folder+'otu_table_rare.csv', header=0, index_col=0)
ft_ra = ft_rare.divide(ft_rare.sum(axis=0), axis=1).multiply(100)
ft_ra = ft_ra.loc[:, labels]
plt.figure(figsize=(14,10))
ax1 = plt.subplot2grid((16,10), (0, 0), rowspan=8, colspan=6)
sml_ax = plt.subplot2grid((24,10), (12, 0), colspan=6)
plot_bar(ax1, sml_ax, ft_ra, tax_dict, 3)
plt.subplots_adjust(hspace=0)
plt.tight_layout()
plt.show()
# Family (level 4):
ft_rare = pd.read_csv(folder+'otu_table_rare.csv', header=0, index_col=0)
ft_ra = ft_rare.divide(ft_rare.sum(axis=0), axis=1).multiply(100)
ft_ra = ft_ra.loc[:, labels]
plt.figure(figsize=(14,10))
ax1 = plt.subplot2grid((16,10), (0, 0), rowspan=8, colspan=6)
sml_ax = plt.subplot2grid((24,10), (12, 0), colspan=6)
plot_bar(ax1, sml_ax, ft_ra, tax_dict, 4)
plt.subplots_adjust(hspace=0)
plt.tight_layout()
plt.show()
# Genus (level 5):
ft_rare = pd.read_csv(folder+'otu_table_rare.csv', header=0, index_col=0)
ft_ra = ft_rare.divide(ft_rare.sum(axis=0), axis=1).multiply(100)
ft_ra = ft_ra.loc[:, labels]
plt.figure(figsize=(14,10))
ax1 = plt.subplot2grid((16,10), (0, 0), rowspan=8, colspan=6)
sml_ax = plt.subplot2grid((24,10), (12, 0), colspan=6)
plot_bar(ax1, sml_ax, ft_ra, tax_dict, 5)
plt.subplots_adjust(hspace=0)
plt.tight_layout()
plt.show()
# Species (level 6):
ft_rare = pd.read_csv(folder+'otu_table_rare.csv', header=0, index_col=0)
ft_ra = ft_rare.divide(ft_rare.sum(axis=0), axis=1).multiply(100)
ft_ra = ft_ra.loc[:, labels]
plt.figure(figsize=(14,10))
ax1 = plt.subplot2grid((16,10), (0, 0), rowspan=8, colspan=6)
sml_ax = plt.subplot2grid((24,10), (12, 0), colspan=6)
plot_bar(ax1, sml_ax, ft_ra, tax_dict, 6)
plt.subplots_adjust(hspace=0)
plt.tight_layout()
plt.show()
Each of these plots genera (or the lowest classification that we have for each ASV). Yellow shows high abundance whereas dark blue shows low abundance. Where they are normalised, all of the values for a taxon are scaled to the maximum abundance within that taxon (i.e. from 0 to 1).
def dendro_group(data, data_plot, tax_dictionary, level=5, dist_met='braycurtis', norm=True):
"""Ward dendrogram + Treatment/site strip + top-40-taxa heatmap.

data: samples x samples distance matrix when dist_met=False; otherwise a
table whose rows are compared with scipy's cdist using metric dist_met.
data_plot: feature table (ASVs x samples) shown in the heatmap.
tax_dictionary: ASV -> taxonomy list; heatmap rows are collapsed to `level`.
norm: if True, scale each heatmap row to its own maximum (0-1).
Relies on notebook globals color_group and md for the sample strip.
"""
# dendrogram
plt.figure(figsize=(10,15))
ax1 = plt.subplot2grid((20,10), (0,2), rowspan=2, frameon=False, colspan=8)
ax2 = plt.subplot2grid((20,10), (2,2), colspan=8)
ax3 = plt.subplot2grid((20,10), (5,2), rowspan=15, colspan=8)
plt.sca(ax1)
df = pd.DataFrame(data).transpose()
if dist_met != False:
# Compute pairwise distances between rows with the requested metric.
# NOTE(review): y and X_true below are unused aliases left from a template.
X = df.iloc[0:].values
y = df.iloc[:,0].values
X_true = X
similarities = distance.cdist(X_true, X_true, dist_met)
else:
# data is already a square distance matrix; just clear any NaNs.
similarities = np.nan_to_num(df)
# Condensed distances -> Ward linkage -> all-black dendrogram.
Z = ssd.squareform(similarities)
Z = hierarchy.linkage(Z, "ward")
mpl.rcParams['lines.linewidth'] = 2
hierarchy.set_link_color_palette(['k'])
dn = hierarchy.dendrogram(Z, above_threshold_color='k')
# Recover the leaf order from the dendrogram tick labels (leaf indices),
# then map those indices back to sample names.
x_labels, locs, xlocs, labels = list(ax1.get_xticklabels()), list(ax1.get_xticks()), list(ax1.get_yticks()), []
for x in x_labels:
labels.append(x.get_text())
order_names = [list(df.index.values)[int(l)] for l in labels]
#plt.xticks(locs, order_names, rotation=90)
plt.xticks([])
plt.yticks([])
#sample groups: one cell per sample coloured by Treatment, labelled with the site number
plt.sca(ax2)
col_md = []
x, y = [a for a in range(len(data.index.values))], [1 for a in range(len(data.index.values))]
count = 0
for sample in order_names:
col_md.append(color_group[md.loc[sample, 'Treatment']])
# Last character of the Location string is the site number.
ax2.text(count, 0.5, md.loc[sample, 'Location'][-1], va='center', ha='center')
count += 1
ax2.bar(x, y, color=col_md, edgecolor='k', width=1)
#ax2.legend(handles=handles_color, loc='upper left', bbox_to_anchor=(1.01, 1.05))
plt.xticks(x, order_names, rotation=90)
plt.xlim([-0.5, data.shape[0]-0.5]), plt.ylim([0,1])
plt.yticks([])
#heatmap: 40 most abundant taxa (by mean) collapsed to `level`, samples in dendrogram order
plt.sca(ax3)
data_plotting = pd.DataFrame(data_plot.loc[:, order_names])
rename_genus = {}
for asv in data_plotting.index.values:
tax = tax_dictionary[asv]
if len(tax) >= (level+1):
rename_genus[asv] = tax[level]
else:
# No classification at this depth: fall back to the deepest label.
rename_genus[asv] = tax[-1]
data_plotting = data_plotting.rename(index=rename_genus)
data_plotting = data_plotting.groupby(by=data_plotting.index, axis=0).sum()
data_plotting['Mean'] = data_plotting.mean(axis=1)
data_plotting = data_plotting.sort_values('Mean', ascending=True).iloc[-40:, :]
data_plotting = data_plotting.drop(['Mean'], axis=1)
if norm:
# Scale each row to its own maximum so every taxon spans 0-1.
data_plotting = data_plotting.div(data_plotting.max(axis=1), axis=0)
plt.pcolor(data_plotting)
plt.xticks([])
y = [a+0.5 for a in range(0,40)]
plt.yticks(y, data_plotting.index.values)
return
# Weighted UniFrac (rarefied): clustering + heatmap, rows normalised.
w_uf_rare = pd.read_csv(folder+'weighted_unifrac_rare.csv', header=0, index_col=0)
dendro_group(w_uf_rare, ft_rare, tax_dict, dist_met=False, norm=True)
plt.tight_layout()
plt.show()
# Weighted UniFrac (rarefied): same clustering, unnormalised heatmap.
w_uf_rare = pd.read_csv(folder+'weighted_unifrac_rare.csv', header=0, index_col=0)
dendro_group(w_uf_rare, ft_rare, tax_dict, dist_met=False, norm=False)
plt.tight_layout()
plt.show()
# Unweighted UniFrac (rarefied), normalised heatmap.
uw_uf_rare = pd.read_csv(folder+'unweighted_unifrac_rare.csv', header=0, index_col=0)
dendro_group(uw_uf_rare, ft_rare, tax_dict, dist_met=False, norm=True)
plt.tight_layout()
plt.show()
# Unweighted UniFrac (rarefied), unnormalised heatmap.
uw_uf_rare = pd.read_csv(folder+'unweighted_unifrac_rare.csv', header=0, index_col=0)
dendro_group(uw_uf_rare, ft_rare, tax_dict, dist_met=False, norm=False)
plt.tight_layout()
plt.show()
# PhILR distance for the clustering, CLR abundances in the heatmap (normalised).
clr = pd.read_csv(folder+'otu_table_clr.csv', header=0, index_col=0)
philr = pd.read_csv(folder+'philr_distance.csv', header=0, index_col=0)
dendro_group(philr, clr, tax_dict, dist_met=False, norm=True)
plt.tight_layout()
plt.show()
# PhILR distance for the clustering, CLR abundances in the heatmap (raw).
clr = pd.read_csv(folder+'otu_table_clr.csv', header=0, index_col=0)
philr = pd.read_csv(folder+'philr_distance.csv', header=0, index_col=0)
dendro_group(philr, clr, tax_dict, dist_met=False, norm=False)
plt.tight_layout()
plt.show()
Here we’ll use ANCOM to determine whether taxa are differentially abundant between different treatments. There are lots of different methods for differential abundance testing (e.g. see here - didn’t include ANCOM as it takes a very long time to run on large datasets), but many tools return lots of taxa as being differentially abundant but have very high false positive rates. ANCOM will return fewer taxa but has a much lower false positive rate. First we’ll save feature tables at each phylogenetic level so we can test them all:
# Build one count table per taxonomic level for ANCOM.  `levels` is defined
# earlier in the notebook (presumably Domain..Species, seven entries, since
# the R code below indexes ft[1]..ft[8] -- TODO confirm).  The untouched ASV
# table is appended last.
ft = pd.read_csv(folder+'exports/feature_table_no_tax.csv', header=0, index_col=0)
all_ft = []
for a in range(len(levels)):
ft_level = pd.DataFrame(ft)
rename_dict = {}
for asv in ft_level.index.values:
tax = tax_dict[asv]
# ASVs without a classification at this depth keep their deepest label.
if len(tax) < (a+1):
rename_dict[asv] = tax[-1]
else:
rename_dict[asv] = tax[a]
ft_level = ft_level.rename(index=rename_dict)
ft_level = ft_level.groupby(by=ft_level.index, axis=0).sum()
all_ft.append(ft_level)
all_ft.append(ft)
Get the metadata table
# Metadata: keep only the Treatment column and reshape it into the
# two-column (Samples, Groups) layout expected by the ANCOM script.
md = pd.read_csv(folder+'exports/metadata.txt', sep='\t', header=0, index_col=0)
md_r = md.loc[:, 'Treatment'].reset_index()
md_r = md_r.rename(columns={'#SampleID': 'Samples', 'Treatment': 'Groups'})
Do the ANCOM tests:
source("/Users/robynwright/Documents/OneDrive/Github/R-notebooks/ancom_v2.1.R")
# Feature tables (one per taxonomic level, ASV table last) and the
# two-column (Samples/Groups) metadata passed over from Python via reticulate.
ft <- py$all_ft
md <- py$md_r
# Run the standard ANCOM workflow on one feature table: pre-process
# (library-size filter, structural zeros), then test every taxon for
# differential abundance between Groups.
run_ancom <- function(feature_table) {
  process <- feature_table_pre_process(feature_table, md, 'Samples', 'Groups', lib_cut = 10, neg_lb = TRUE)
  ANCOM(process$feature_table, process$meta_data, process$struc_zero, main_var = 'Groups')
}
# BUG FIX: ft is an R list, so elements must be extracted with [[ ]];
# ft[1] returns a one-element sub-list rather than the table itself.
ancom_out_domain <- run_ancom(ft[[1]])
ancom_domain <- ancom_out_domain$out
ancom_out_phylum <- run_ancom(ft[[2]])
ancom_phylum <- ancom_out_phylum$out
ancom_out_class <- run_ancom(ft[[3]])
ancom_class <- ancom_out_class$out
ancom_out_order <- run_ancom(ft[[4]])
ancom_order <- ancom_out_order$out
ancom_out_family <- run_ancom(ft[[5]])
ancom_family <- ancom_out_family$out
ancom_out_genus <- run_ancom(ft[[6]])
ancom_genus <- ancom_out_genus$out
ancom_out_species <- run_ancom(ft[[7]])
ancom_species <- ancom_out_species$out
ancom_out_ASV <- run_ancom(ft[[8]])
ancom_ASV <- ancom_out_ASV$out
Explore results:
# The ANCOM runs are slow, so the per-level result tables were pickled once
# (commented lines below) and are reloaded from disk here.
# ancom = [r.ancom_domain, r.ancom_phylum, r.ancom_class, r.ancom_order, r.ancom_family, r.ancom_genus, r.ancom_species, r.ancom_ASV]
# with open(folder+'all_ancom_treatment.list', 'wb') as f: pickle.dump(ancom, f)
with open(folder+'all_ancom_treatment.list', 'rb') as f:
    ancom = pickle.load(f)
# Print every row that is flagged True in any of the W-cutoff columns.
for level in ancom:
    for tax in level.index.values:
        row = level.loc[tax, :].values
        if True in row[2:]:
            print(row)
So the only significant differences are with 8 ASVs. First we’ll plot the relative abundance of each taxon within the control and treatment groups:
# Box plots of relative abundance (Treatment vs Control) for each ASV the
# treatment ANCOM flagged as significant.
with open(folder+'all_ancom_treatment.list', 'rb') as f:
    ancom = pickle.load(f)
ft = pd.read_csv(folder+'exports/feature_table_no_tax.csv', header=0, index_col=0)
ft_ra = ft.divide(ft.sum(axis=0), axis=1).multiply(100)
clr = pd.read_csv(folder+'otu_table_clr.csv', index_col=0, header=0)
ft_using = pd.DataFrame(ft_ra)
plt.figure(figsize=(10,6))
ax1, ax2, ax3, ax4, ax5, ax6, ax7, ax8 = plt.subplot(241), plt.subplot(242), plt.subplot(243), plt.subplot(244), plt.subplot(245), plt.subplot(246), plt.subplot(247), plt.subplot(248)
# Collect the significant ASVs from the ASV-level table.  BUG FIX: the
# original tested rows of the leftover loop variable `level` from the
# previous cell; use the ASV-level table (ancom[-1]) directly.
asv_results = ancom[-1]
sigs = []
for tax in asv_results.index.values:
    row = asv_results.loc[tax, :].values
    if True in row[2:]:
        sigs.append(row[0])
# Relabel columns by Treatment so .loc pulls out the two groups directly.
rename_treat = {}
for sample in ft_using.columns:
    rename_treat[sample] = md.loc[sample, 'Treatment']
ft_using = ft_using.rename(columns=rename_treat)
axes = [ax1, ax2, ax3, ax4, ax5, ax6, ax7, ax8]
for a in range(len(sigs)):
    sig = sigs[a]
    tax = tax_dict[sig]
    tax_ra = pd.DataFrame(ft_using.loc[sig, :])
    treat, cont = tax_ra.loc['Treatment', :], tax_ra.loc['Control', :]
    axes[a].boxplot([treat, cont])
    plt.sca(axes[a])
    if a < 4:
        plt.xticks([1, 2], ['', ''])
    else:
        plt.xticks([1, 2], ['Treatment', 'Control'])
    if a in [0, 4]:
        plt.ylabel('Relative abundance (%)')
    # Re-wrap the underscore-delimited taxon name for the panel title,
    # inserting a line break after the sixth token.
    t1 = tax[-1].split('_')
    t2 = t1[0]+t1[1]
    for t in range(2, len(t1)):
        t2 += ' '+t1[t]
        if t == 5:
            t2 += '\n'
    plt.title(t2, fontsize=10)
plt.tight_layout()
plt.show()
Now plot the CLR:
# Same panels as above but plotting CLR-transformed abundances.
with open(folder+'all_ancom_treatment.list', 'rb') as f:
    ancom = pickle.load(f)
ft = pd.read_csv(folder+'exports/feature_table_no_tax.csv', header=0, index_col=0)
ft_ra = ft.divide(ft.sum(axis=0), axis=1).multiply(100)
clr = pd.read_csv(folder+'otu_table_clr.csv', index_col=0, header=0)
ft_using = pd.DataFrame(clr)
plt.figure(figsize=(10,6))
ax1, ax2, ax3, ax4, ax5, ax6, ax7, ax8 = plt.subplot(241), plt.subplot(242), plt.subplot(243), plt.subplot(244), plt.subplot(245), plt.subplot(246), plt.subplot(247), plt.subplot(248)
# BUG FIX: as in the relative-abundance cell, select significant ASVs from
# ancom[-1] rather than the stale `level` variable from an earlier cell
# (which would be undefined if this cell were run on its own).
asv_results = ancom[-1]
sigs = []
for tax in asv_results.index.values:
    row = asv_results.loc[tax, :].values
    if True in row[2:]:
        sigs.append(row[0])
# Relabel columns by Treatment so .loc pulls out the two groups directly.
rename_treat = {}
for sample in ft_using.columns:
    rename_treat[sample] = md.loc[sample, 'Treatment']
ft_using = ft_using.rename(columns=rename_treat)
axes = [ax1, ax2, ax3, ax4, ax5, ax6, ax7, ax8]
for a in range(len(sigs)):
    sig = sigs[a]
    tax = tax_dict[sig]
    tax_ra = pd.DataFrame(ft_using.loc[sig, :])
    treat, cont = tax_ra.loc['Treatment', :], tax_ra.loc['Control', :]
    axes[a].boxplot([treat, cont])
    plt.sca(axes[a])
    if a < 4:
        plt.xticks([1, 2], ['', ''])
    else:
        plt.xticks([1, 2], ['Treatment', 'Control'])
    if a in [0, 4]:
        plt.ylabel('CLR abundance')
    # Re-wrap the underscore-delimited taxon name for the panel title,
    # inserting a line break after the sixth token.
    t1 = tax[-1].split('_')
    t2 = t1[0]+t1[1]
    for t in range(2, len(t1)):
        t2 += ' '+t1[t]
        if t == 5:
            t2 += '\n'
    plt.title(t2, fontsize=10)
plt.tight_layout()
plt.show()
Get the metadata table:
# Metadata again, this time grouping by sampling site (Location) for the
# second round of ANCOM tests.
md = pd.read_csv(folder+'exports/metadata.txt', sep='\t', header=0, index_col=0)
md_r = md.loc[:, 'Location'].reset_index()
md_r = md_r.rename(columns={'#SampleID': 'Samples', 'Location': 'Groups'})
Do the ANCOM tests:
source("/Users/robynwright/Documents/OneDrive/Github/R-notebooks/ancom_v2.1.R")
# Same ANCOM workflow as for Treatment, but md_r now carries Location as
# the Groups column.
ft <- py$all_ft
md <- py$md_r
# Pre-process one feature table and test all taxa between Groups.
run_ancom <- function(feature_table) {
  process <- feature_table_pre_process(feature_table, md, 'Samples', 'Groups', lib_cut = 10, neg_lb = TRUE)
  ANCOM(process$feature_table, process$meta_data, process$struc_zero, main_var = 'Groups')
}
# BUG FIX: use [[ ]] to extract list elements; ft[1] returns a sub-list.
ancom_out_domain <- run_ancom(ft[[1]])
ancom_domain <- ancom_out_domain$out
ancom_out_phylum <- run_ancom(ft[[2]])
ancom_phylum <- ancom_out_phylum$out
ancom_out_class <- run_ancom(ft[[3]])
ancom_class <- ancom_out_class$out
ancom_out_order <- run_ancom(ft[[4]])
ancom_order <- ancom_out_order$out
ancom_out_family <- run_ancom(ft[[5]])
ancom_family <- ancom_out_family$out
ancom_out_genus <- run_ancom(ft[[6]])
ancom_genus <- ancom_out_genus$out
ancom_out_species <- run_ancom(ft[[7]])
ancom_species <- ancom_out_species$out
ancom_out_ASV <- run_ancom(ft[[8]])
ancom_ASV <- ancom_out_ASV$out
Explore results:
# Location ANCOM results were pickled previously; reload and summarise.
# ancom = [r.ancom_domain, r.ancom_phylum, r.ancom_class, r.ancom_order, r.ancom_family, r.ancom_genus, r.ancom_species, r.ancom_ASV]
# with open(folder+'all_ancom_location.list', 'wb') as f: pickle.dump(ancom, f)
with open(folder+'all_ancom_location.list', 'rb') as f:
    ancom = pickle.load(f)
levels = ["Domain", "Phylum", "Class", "Order", "Family", "Genus", "Species", "ASV"]
# Count, per taxonomic level, the taxa flagged True in the first W-cutoff column.
for l, level in enumerate(ancom):
    sig = []
    for tax in level.index.values:
        row = level.loc[tax, :].values
        if row[2] == True:
            sig.append(row[0])
    print(levels[l], len(sig))
## Domain 0
## Phylum 6
## Class 14
## Order 30
## Family 49
## Genus 60
## Species 67
## ASV 735
Now we have a lot of differences, so we’ll work through them by taxonomic level and maybe do something a little different for each. We’ll only plot the CLR values here. Phylum:
# Phylum level: CLR box plots per site for each phylum flagged significant
# by the location ANCOM (ancom[1]).
with open(folder+'all_ancom_location.list', 'rb') as f: ancom = pickle.load(f)
clr = pd.read_csv(folder+'otu_table_clr.csv', index_col=0, header=0)
plt.figure(figsize=(10,6))
ax1, ax2, ax3, ax4, ax5, ax6 = plt.subplot(231), plt.subplot(232), plt.subplot(233), plt.subplot(234), plt.subplot(235), plt.subplot(236)
axes = [ax1, ax2, ax3, ax4, ax5, ax6]
ancom_diff = ancom[1]
# Collapse ASV-level CLR values to phyla (domain label when unclassified).
rename_phyla = {}
for asv in clr.index.values:
if len(tax_dict[asv]) > 1:
rename_phyla[asv] = tax_dict[asv][1]
else:
rename_phyla[asv] = tax_dict[asv][0]
clr_phyla = clr.rename(index=rename_phyla)
# NOTE(review): summing CLR values across ASVs within a phylum is not the
# CLR of the summed counts -- confirm this aggregation is intended.
clr_phyla = clr_phyla.groupby(by=clr_phyla.index, axis=0).sum()
rename_treat = {}
for sample in clr_phyla.columns:
rename_treat[sample] = md.loc[sample, 'Location']
clr_phyla = clr_phyla.rename(columns=rename_treat)
count = 0
for row in ancom_diff.index.values:
if ancom_diff.loc[row, :].values[2] == True:
tax = ancom_diff.loc[row, :].values[0]
tax_ra = pd.DataFrame(clr_phyla.loc[tax, :])
s1, s2, s3, s4, s5 = tax_ra.loc['Site1', :], tax_ra.loc['Site2', :], tax_ra.loc['Site3', :], tax_ra.loc['Site4', :], tax_ra.loc['Site5', :]
axes[count].boxplot([s1, s2, s3, s4, s5])
plt.sca(axes[count])
if count < 3:
plt.xticks([1,2,3,4,5], ['', '', '', '', ''])
else:
plt.xticks([1,2,3,4,5], ['Site1', 'Site2', 'Site3', 'Site4', 'Site5'])
if count in [0, 3]: plt.ylabel('CLR abundance')
# Re-wrap the underscore-delimited taxon name for the panel title.
t1 = tax.split('_')
t2 = t1[0]+t1[1]
for t in range(2, len(t1)):
t2+=' '+t1[t]
if t == 5: t2 += '\n'
plt.title(t2, fontsize=10)
count += 1
plt.tight_layout()
plt.show()
Class:
# Class level: CLR box plots per site for each class flagged significant
# by the location ANCOM (ancom[2]); 14 significant classes, 14 panels.
with open(folder+'all_ancom_location.list', 'rb') as f: ancom = pickle.load(f)
clr = pd.read_csv(folder+'otu_table_clr.csv', index_col=0, header=0)
plt.figure(figsize=(10,10))
ax1, ax2, ax3, ax4, ax5, ax6, ax7, ax8, ax9, ax10, ax11, ax12, ax13, ax14 = plt.subplot(3,5,1), plt.subplot(3,5,2), plt.subplot(3,5,3), plt.subplot(3,5,4), plt.subplot(3,5,5), plt.subplot(3,5,6), plt.subplot(3,5,7), plt.subplot(3,5,8), plt.subplot(3,5,9), plt.subplot(3,5,10), plt.subplot(3,5,11), plt.subplot(3,5,12), plt.subplot(3,5,13), plt.subplot(3,5,14)
axes = [ax1, ax2, ax3, ax4, ax5, ax6, ax7, ax8, ax9, ax10, ax11, ax12, ax13, ax14]
ancom_diff = ancom[2]
# Collapse ASV-level CLR values to classes (deepest label when unclassified).
rename_class = {}
for asv in clr.index.values:
if len(tax_dict[asv]) > 2:
rename_class[asv] = tax_dict[asv][2]
else:
rename_class[asv] = tax_dict[asv][-1]
clr_class = clr.rename(index=rename_class)
clr_class = clr_class.groupby(by=clr_class.index, axis=0).sum()
rename_treat = {}
for sample in clr_class.columns:
rename_treat[sample] = md.loc[sample, 'Location']
clr_class = clr_class.rename(columns=rename_treat)
count = 0
for row in ancom_diff.index.values:
if ancom_diff.loc[row, :].values[2] == True:
tax = ancom_diff.loc[row, :].values[0]
tax_ra = pd.DataFrame(clr_class.loc[tax, :])
s1, s2, s3, s4, s5 = tax_ra.loc['Site1', :], tax_ra.loc['Site2', :], tax_ra.loc['Site3', :], tax_ra.loc['Site4', :], tax_ra.loc['Site5', :]
axes[count].boxplot([s1, s2, s3, s4, s5])
plt.sca(axes[count])
if count < 9:
plt.xticks([1,2,3,4,5], ['', '', '', '', ''])
else:
plt.xticks([1,2,3,4,5], ['Site1', 'Site2', 'Site3', 'Site4', 'Site5'], rotation=90)
if count in [0, 5, 10]: plt.ylabel('CLR abundance')
# Re-wrap the underscore-delimited taxon name for the panel title.
t1 = tax.split('_')
t2 = t1[0]+t1[1]
for t in range(2, len(t1)):
t2+=' '+t1[t]
if t == 5: t2 += '\n'
plt.title(t2, fontsize=10)
count += 1
plt.tight_layout()
plt.show()
Order:
# Order level (lv=3): too many significant taxa for box plots, so draw a
# heatmap of CLR abundance for significant orders, samples sorted by site.
with open(folder+'all_ancom_location.list', 'rb') as f: ancom = pickle.load(f)
clr = pd.read_csv(folder+'otu_table_clr.csv', index_col=0, header=0)
plt.figure(figsize=(10,10))
ax1 = plt.subplot(111)
lv = 3
ancom_diff = ancom[lv]
# Collapse ASVs to this level (deepest label when unclassified).
rename_level = {}
for asv in clr.index.values:
if len(tax_dict[asv]) > lv:
rename_level[asv] = tax_dict[asv][lv]
else:
rename_level[asv] = tax_dict[asv][-1]
clr_level = clr.rename(index=rename_level)
clr_level = clr_level.groupby(by=clr_level.index, axis=0).sum()
rename_treat = {}
for sample in clr_level.columns:
rename_treat[sample] = md.loc[sample, 'Location']
clr_level = clr_level.rename(columns=rename_treat)
sort_names = sorted(set(clr_level.columns))
clr_level = clr_level.loc[:, sort_names]
# Keep only the taxa the ANCOM test flagged as significant.
sigs = []
for row in ancom_diff.index.values:
if ancom_diff.loc[row, :].values[2] == True:
sigs.append(ancom_diff.loc[row, :].values[0])
clr_level = clr_level.loc[sigs, :]
plt.pcolor(clr_level)
plt.xticks([])
y = [a+0.5 for a in range(0,clr_level.shape[0])]
x = [a+0.5 for a in range(0,clr_level.shape[1])]
plt.yticks(y, clr_level.index.values)
plt.xticks(x, clr_level.columns, rotation=90)
plt.tight_layout()
plt.show()
Family:
# Family level (lv=4): heatmap of CLR abundance for significant families.
with open(folder+'all_ancom_location.list', 'rb') as f: ancom = pickle.load(f)
clr = pd.read_csv(folder+'otu_table_clr.csv', index_col=0, header=0)
plt.figure(figsize=(10,10))
ax1 = plt.subplot(111)
lv = 4
ancom_diff = ancom[lv]
# Collapse ASVs to this level (deepest label when unclassified).
rename_level = {}
for asv in clr.index.values:
if len(tax_dict[asv]) > lv:
rename_level[asv] = tax_dict[asv][lv]
else:
rename_level[asv] = tax_dict[asv][-1]
clr_level = clr.rename(index=rename_level)
clr_level = clr_level.groupby(by=clr_level.index, axis=0).sum()
rename_treat = {}
for sample in clr_level.columns:
rename_treat[sample] = md.loc[sample, 'Location']
clr_level = clr_level.rename(columns=rename_treat)
sort_names = sorted(set(clr_level.columns))
clr_level = clr_level.loc[:, sort_names]
# Keep only the taxa the ANCOM test flagged as significant.
sigs = []
for row in ancom_diff.index.values:
if ancom_diff.loc[row, :].values[2] == True:
sigs.append(ancom_diff.loc[row, :].values[0])
clr_level = clr_level.loc[sigs, :]
print(clr_level.shape[0])
plt.pcolor(clr_level)
plt.xticks([])
y = [a+0.5 for a in range(0,clr_level.shape[0])]
x = [a+0.5 for a in range(0,clr_level.shape[1])]
plt.yticks(y, clr_level.index.values)
plt.xticks(x, clr_level.columns, rotation=90)
plt.tight_layout()
plt.show()
Genus:
# Genus level (lv=5): heatmap of CLR abundance for significant genera.
with open(folder+'all_ancom_location.list', 'rb') as f: ancom = pickle.load(f)
clr = pd.read_csv(folder+'otu_table_clr.csv', index_col=0, header=0)
plt.figure(figsize=(10,10))
ax1 = plt.subplot(111)
lv = 5
ancom_diff = ancom[lv]
# Collapse ASVs to this level (deepest label when unclassified).
rename_level = {}
for asv in clr.index.values:
if len(tax_dict[asv]) > lv:
rename_level[asv] = tax_dict[asv][lv]
else:
rename_level[asv] = tax_dict[asv][-1]
clr_level = clr.rename(index=rename_level)
clr_level = clr_level.groupby(by=clr_level.index, axis=0).sum()
rename_treat = {}
for sample in clr_level.columns:
rename_treat[sample] = md.loc[sample, 'Location']
clr_level = clr_level.rename(columns=rename_treat)
sort_names = sorted(set(clr_level.columns))
clr_level = clr_level.loc[:, sort_names]
# Keep only the taxa the ANCOM test flagged as significant.
sigs = []
for row in ancom_diff.index.values:
if ancom_diff.loc[row, :].values[2] == True:
sigs.append(ancom_diff.loc[row, :].values[0])
clr_level = clr_level.loc[sigs, :]
print(clr_level.shape[0])
plt.pcolor(clr_level)
plt.xticks([])
y = [a+0.5 for a in range(0,clr_level.shape[0])]
x = [a+0.5 for a in range(0,clr_level.shape[1])]
plt.yticks(y, clr_level.index.values, fontsize=8)
plt.xticks(x, clr_level.columns, rotation=90)
plt.tight_layout()
plt.show()
Species:
# Species level (lv=6): heatmap of CLR abundance for significant species.
with open(folder+'all_ancom_location.list', 'rb') as f: ancom = pickle.load(f)
clr = pd.read_csv(folder+'otu_table_clr.csv', index_col=0, header=0)
plt.figure(figsize=(10,10))
ax1 = plt.subplot(111)
lv = 6
ancom_diff = ancom[lv]
# Collapse ASVs to this level (deepest label when unclassified).
rename_level = {}
for asv in clr.index.values:
if len(tax_dict[asv]) > lv:
rename_level[asv] = tax_dict[asv][lv]
else:
rename_level[asv] = tax_dict[asv][-1]
clr_level = clr.rename(index=rename_level)
clr_level = clr_level.groupby(by=clr_level.index, axis=0).sum()
rename_treat = {}
for sample in clr_level.columns:
rename_treat[sample] = md.loc[sample, 'Location']
clr_level = clr_level.rename(columns=rename_treat)
sort_names = sorted(set(clr_level.columns))
clr_level = clr_level.loc[:, sort_names]
# Keep only the taxa the ANCOM test flagged as significant.
sigs = []
for row in ancom_diff.index.values:
if ancom_diff.loc[row, :].values[2] == True:
sigs.append(ancom_diff.loc[row, :].values[0])
clr_level = clr_level.loc[sigs, :]
print(clr_level.shape[0])
plt.pcolor(clr_level)
plt.xticks([])
y = [a+0.5 for a in range(0,clr_level.shape[0])]
x = [a+0.5 for a in range(0,clr_level.shape[1])]
plt.yticks(y, clr_level.index.values, fontsize=8)
plt.xticks(x, clr_level.columns, rotation=90)
plt.tight_layout()
plt.show()
ASV (renamed to the lowest classification available):
# ASV level (lv=7): here the significant ASVs are selected *first*, then
# renamed to their lowest available classification and summed, so rows with
# the same label are merged only among significant ASVs.
with open(folder+'all_ancom_location.list', 'rb') as f: ancom = pickle.load(f)
clr = pd.read_csv(folder+'otu_table_clr.csv', index_col=0, header=0)
plt.figure(figsize=(10,30))
ax1 = plt.subplot(111)
lv = 7
ancom_diff = ancom[lv]
rename_level = {}
for asv in clr.index.values:
if len(tax_dict[asv]) > lv:
rename_level[asv] = tax_dict[asv][lv]
else:
rename_level[asv] = tax_dict[asv][-1]
clr_level = pd.DataFrame(clr)
rename_treat = {}
for sample in clr_level.columns:
rename_treat[sample] = md.loc[sample, 'Location']
clr_level = clr_level.rename(columns=rename_treat)
sort_names = sorted(set(clr_level.columns))
clr_level = clr_level.loc[:, sort_names]
# Keep only the ASVs the ANCOM test flagged as significant.
sigs = []
for row in ancom_diff.index.values:
if ancom_diff.loc[row, :].values[2] == True:
sigs.append(ancom_diff.loc[row, :].values[0])
clr_level = clr_level.loc[sigs, :]
clr_level = clr_level.rename(index=rename_level)
clr_level = clr_level.groupby(by=clr_level.index, axis=0).sum()
plt.pcolor(clr_level)
plt.xticks([])
y = [a+0.5 for a in range(0,clr_level.shape[0])]
x = [a+0.5 for a in range(0,clr_level.shape[1])]
plt.yticks(y, clr_level.index.values, fontsize=8)
plt.xticks(x, clr_level.columns, rotation=90)
plt.tight_layout()
plt.show()
We’ll mainly use Phyloseq for this analysis as it incorporates a lot of plotting and normalisation functions.
I’m starting with the feature table where I removed the first row (the “# constructed from biom file”) - this makes it a lot easier to read in. I’ve also edited the sample names as R doesn’t like sample names that begin with a number (so instead of reading 2ndBatch they are now Batch2).
options(warn = -1)
# Build the phyloseq object from the exported ASV counts, taxonomy strings,
# sample metadata and phylogenetic tree.  NOTE(review): the column indices
# below (taxonomy in column 22, counts in 2:21, metadata in 2:5) are
# hard-coded for 20 samples -- update them if the exports change.
asv_table <- read.csv(paste(folder, "exports/feature_table_R.csv", sep="")) #read in table from file
sampledata <- read.csv(paste(folder, "exports/metadata_R.csv", sep="")) #read in the metadata table
phy_tree <- read_tree(paste(folder, "exports/tree.nwk", sep='')) #read in the phylogenetic tree
taxonomy = asv_table[, c(1, 22)] #take only the OTU ID and taxonomy column to a new table
asv_table = asv_table[, 1:21] #take the OTU ID and the other columns to be the ASV table
asv_table_num = data.matrix(asv_table[,2:21]) #convert the ASV table to a numeric matrix
rownames(asv_table_num) = asv_table[,1] #give the matrix row names (OTU IDs)
asv_table = as.matrix(asv_table_num) #convert it to a matrix
taxonomy <- separate(data = taxonomy, col = taxonomy, into = c("Domain", "Phylum", "Class", "Order", "Family", "Genus", "Species"), sep = "\\;") #separate the taxonomy table so each phylogenetic level is its own column
taxmat <- taxonomy[,-1] #remove the OTU ID column from the taxonomy table
rownames(taxmat) <- taxonomy[,1] #and now give the taxonomy table the OTU IDs as row names
samples <- sampledata[, 2:5] #get the metadata columns
rownames(samples) = sampledata[,1] #and add the sample names as row names
samples = data.frame(samples, stringsAsFactors = FALSE) #convert this to a data frame
#convert these to phyloseq objects and combine them into a single physeq object
ASV = otu_table(asv_table, taxa_are_rows = TRUE)
TAX = tax_table(taxmat)
SAMPLE = sample_data(samples)
taxa_names(TAX) <- taxonomy[,1]
physeq = phyloseq(ASV,phy_tree,TAX,SAMPLE)
We’ll perform all of the normalisations (as we did above) that we will want to use at some point
options(warn = -1)
# Rarefy to the smallest library size; rngseed makes the random subsampling
# reproducible (the original call used the current RNG state, so results varied per run)
physeq_rare <- rarefy_even_depth(physeq, sample.size = min(sample_sums(physeq)), rngseed = 711, replace = TRUE, trimOTUs = TRUE, verbose = TRUE)
physeq_clr <- microbiome::transform(physeq, "clr") #centred log-ratio transform
physeq_relabun <- transform_sample_counts(physeq, function(x) (x / sum(x))*100) #percent relative abundance per sample
options(warn = -1)
# Rarefaction curves on raw counts; vegan expects samples as rows, so transpose the otu_table
rarecurve(t(otu_table(physeq)), step=50, cex=0.5)
Richness (observed ASVs), Chao1, Simpson's & Shannon diversity. The diversity measures can’t be calculated on numbers that aren’t integers, so we can’t calculate these for the relative abundance or CLR transformed data.
options(warn = -1)
# Alpha diversity on the unnormalised counts: all samples side by side
plot_richness(physeq, measures=c("Observed", "Chao1", "Simpson", "Shannon"))
options(warn = -1)
# Grouped by sampling Location, with boxplots overlaid
plot_richness(physeq, x="Location", measures=c("Observed", "Chao1", "Simpson", "Shannon")) + geom_boxplot()
options(warn = -1)
# Grouped by Treatment
plot_richness(physeq, x="Treatment", measures=c("Observed", "Chao1", "Simpson", "Shannon")) + geom_boxplot()
Richness (observed ASVs), Chao1, Simpson's & Shannon diversity.
options(warn = -1)
# Alpha diversity on the rarefied counts: all samples side by side
plot_richness(physeq_rare, measures=c("Observed", "Chao1", "Simpson", "Shannon"))
options(warn = -1)
# Grouped by sampling Location, with boxplots overlaid
plot_richness(physeq_rare, x="Location", measures=c("Observed", "Chao1", "Simpson", "Shannon")) + geom_boxplot()
options(warn = -1)
# Grouped by Treatment
plot_richness(physeq_rare, x="Treatment", measures=c("Observed", "Chao1", "Simpson", "Shannon")) + geom_boxplot()
We have NMDS plots above (and these will look the same as those if we choose to use NMDS here), so here I’m plotting PCoA, so the % variation explained by each axis is in brackets. The results of adonis (permanova) tests for significant differences between groups are also shown. Location is samples grouped only to sites while Location2 has information on Treatment and Location.
options(warn = -1)
# Bray-Curtis PCoA on the unnormalised counts, plus PERMANOVA (adonis) tests below.
# NOTE(review): vegan::adonis is deprecated in vegan >= 2.6 -- consider adonis2
ps = physeq
ps.ord <- ordinate(ps, "PCoA", "bray")
plot_ordination(ps, ps.ord, type="samples", color="Treatment", shape="Location")
distance <- phyloseq::distance(ps, method="bray", weighted=F)
print(adonis(distance ~ sample_data(ps)$Treatment))
##
## Call:
## adonis(formula = distance ~ sample_data(ps)$Treatment)
##
## Permutation: free
## Number of permutations: 999
##
## Terms added sequentially (first to last)
##
## Df SumsOfSqs MeanSqs F.Model R2 Pr(>F)
## sample_data(ps)$Treatment 1 0.2943 0.29427 0.82318 0.04373 0.589
## Residuals 18 6.4345 0.35747 0.95627
## Total 19 6.7288 1.00000
print(adonis(distance ~ sample_data(ps)$Location))
##
## Call:
## adonis(formula = distance ~ sample_data(ps)$Location)
##
## Permutation: free
## Number of permutations: 999
##
## Terms added sequentially (first to last)
##
## Df SumsOfSqs MeanSqs F.Model R2 Pr(>F)
## sample_data(ps)$Location 4 1.1572 0.28929 0.77882 0.17197 0.883
## Residuals 15 5.5716 0.37144 0.82803
## Total 19 6.7288 1.00000
print(adonis(distance ~ sample_data(ps)$Location2))
##
## Call:
## adonis(formula = distance ~ sample_data(ps)$Location2)
##
## Permutation: free
## Number of permutations: 999
##
## Terms added sequentially (first to last)
##
## Df SumsOfSqs MeanSqs F.Model R2 Pr(>F)
## sample_data(ps)$Location2 6 1.8216 0.30359 0.80426 0.27071 0.865
## Residuals 13 4.9072 0.37748 0.72929
## Total 19 6.7288 1.00000
options(warn = -1)
# Bray-Curtis PCoA on the rarefied counts, plus PERMANOVA (adonis) tests below
ps = physeq_rare
ps.ord <- ordinate(ps, "PCoA", "bray")
plot_ordination(ps, ps.ord, type="samples", color="Treatment", shape="Location")
distance <- phyloseq::distance(ps, method="bray", weighted=F)
print(adonis(distance ~ sample_data(ps)$Treatment))
##
## Call:
## adonis(formula = distance ~ sample_data(ps)$Treatment)
##
## Permutation: free
## Number of permutations: 999
##
## Terms added sequentially (first to last)
##
## Df SumsOfSqs MeanSqs F.Model R2 Pr(>F)
## sample_data(ps)$Treatment 1 0.3075 0.30751 0.92814 0.04904 0.438
## Residuals 18 5.9638 0.33132 0.95096
## Total 19 6.2713 1.00000
print(adonis(distance ~ sample_data(ps)$Location))
##
## Call:
## adonis(formula = distance ~ sample_data(ps)$Location)
##
## Permutation: free
## Number of permutations: 999
##
## Terms added sequentially (first to last)
##
## Df SumsOfSqs MeanSqs F.Model R2 Pr(>F)
## sample_data(ps)$Location 4 0.9120 0.22801 0.63819 0.14543 0.97
## Residuals 15 5.3592 0.35728 0.85457
## Total 19 6.2713 1.00000
print(adonis(distance ~ sample_data(ps)$Location2))
##
## Call:
## adonis(formula = distance ~ sample_data(ps)$Location2)
##
## Permutation: free
## Number of permutations: 999
##
## Terms added sequentially (first to last)
##
## Df SumsOfSqs MeanSqs F.Model R2 Pr(>F)
## sample_data(ps)$Location2 6 1.4671 0.24451 0.66165 0.23394 0.968
## Residuals 13 4.8042 0.36955 0.76606
## Total 19 6.2713 1.00000
options(warn = -1)
# Bray-Curtis PCoA on the relative abundances, plus PERMANOVA (adonis) tests below
ps = physeq_relabun
ps.ord <- ordinate(ps, "PCoA", "bray")
plot_ordination(ps, ps.ord, type="samples", color="Treatment", shape="Location")
distance <- phyloseq::distance(ps, method="bray", weighted=F)
print(adonis(distance ~ sample_data(ps)$Treatment))
##
## Call:
## adonis(formula = distance ~ sample_data(ps)$Treatment)
##
## Permutation: free
## Number of permutations: 999
##
## Terms added sequentially (first to last)
##
## Df SumsOfSqs MeanSqs F.Model R2 Pr(>F)
## sample_data(ps)$Treatment 1 0.3037 0.30366 0.93038 0.04915 0.41
## Residuals 18 5.8749 0.32638 0.95085
## Total 19 6.1785 1.00000
print(adonis(distance ~ sample_data(ps)$Location))
##
## Call:
## adonis(formula = distance ~ sample_data(ps)$Location)
##
## Permutation: free
## Number of permutations: 999
##
## Terms added sequentially (first to last)
##
## Df SumsOfSqs MeanSqs F.Model R2 Pr(>F)
## sample_data(ps)$Location 4 0.8802 0.22004 0.62295 0.14246 0.965
## Residuals 15 5.2984 0.35322 0.85754
## Total 19 6.1785 1.00000
print(adonis(distance ~ sample_data(ps)$Location2))
##
## Call:
## adonis(formula = distance ~ sample_data(ps)$Location2)
##
## Permutation: free
## Number of permutations: 999
##
## Terms added sequentially (first to last)
##
## Df SumsOfSqs MeanSqs F.Model R2 Pr(>F)
## sample_data(ps)$Location2 6 1.4096 0.23494 0.64043 0.22815 0.974
## Residuals 13 4.7689 0.36684 0.77185
## Total 19 6.1785 1.00000
Here the first 3 adonis tests are for weighted unifrac and the second 3 are for unweighted.
options(warn = -1)
# Weighted and unweighted UniFrac PCoA on the unnormalised counts, plotted side by side,
# followed by PERMANOVA tests on the weighted distances
ps = physeq
ps.ord.w <- ordinate(ps, "PCoA", "unifrac", weighted=TRUE)
w <- plot_ordination(ps, ps.ord.w, type="samples", color="Treatment", shape="Location", title="Weighted Unifrac")
ps.ord.uw <- ordinate(ps, "PCoA", "unifrac", weighted=FALSE)
uw <- plot_ordination(ps, ps.ord.uw, type="samples", color="Treatment", shape="Location", title="Unweighted Unifrac")
grid.arrange(w, uw, nrow = 1)
distance <- phyloseq::distance(ps, method="unifrac", weighted=T)
print(adonis(distance ~ sample_data(ps)$Treatment))
##
## Call:
## adonis(formula = distance ~ sample_data(ps)$Treatment)
##
## Permutation: free
## Number of permutations: 999
##
## Terms added sequentially (first to last)
##
## Df SumsOfSqs MeanSqs F.Model R2 Pr(>F)
## sample_data(ps)$Treatment 1 0.00856 0.0085581 0.45908 0.02487 0.848
## Residuals 18 0.33555 0.0186416 0.97513
## Total 19 0.34411 1.00000
print(adonis(distance ~ sample_data(ps)$Location))
##
## Call:
## adonis(formula = distance ~ sample_data(ps)$Location)
##
## Permutation: free
## Number of permutations: 999
##
## Terms added sequentially (first to last)
##
## Df SumsOfSqs MeanSqs F.Model R2 Pr(>F)
## sample_data(ps)$Location 4 0.05275 0.013187 0.67889 0.15329 0.841
## Residuals 15 0.29136 0.019424 0.84671
## Total 19 0.34411 1.00000
print(adonis(distance ~ sample_data(ps)$Location2))
##
## Call:
## adonis(formula = distance ~ sample_data(ps)$Location2)
##
## Permutation: free
## Number of permutations: 999
##
## Terms added sequentially (first to last)
##
## Df SumsOfSqs MeanSqs F.Model R2 Pr(>F)
## sample_data(ps)$Location2 6 0.06640 0.011067 0.51806 0.19296 0.978
## Residuals 13 0.27771 0.021362 0.80704
## Total 19 0.34411 1.00000
distance <- phyloseq::distance(ps, method="unifrac", weighted=F)
print(adonis(distance ~ sample_data(ps)$Treatment))
##
## Call:
## adonis(formula = distance ~ sample_data(ps)$Treatment)
##
## Permutation: free
## Number of permutations: 999
##
## Terms added sequentially (first to last)
##
## Df SumsOfSqs MeanSqs F.Model R2 Pr(>F)
## sample_data(ps)$Treatment 1 0.1884 0.18844 1.0399 0.05462 0.298
## Residuals 18 3.2618 0.18121 0.94538
## Total 19 3.4503 1.00000
print(adonis(distance ~ sample_data(ps)$Location))
##
## Call:
## adonis(formula = distance ~ sample_data(ps)$Location)
##
## Permutation: free
## Number of permutations: 999
##
## Terms added sequentially (first to last)
##
## Df SumsOfSqs MeanSqs F.Model R2 Pr(>F)
## sample_data(ps)$Location 4 0.4880 0.12200 0.61778 0.14144 0.956
## Residuals 15 2.9623 0.19748 0.85856
## Total 19 3.4503 1.00000
print(adonis(distance ~ sample_data(ps)$Location2))
##
## Call:
## adonis(formula = distance ~ sample_data(ps)$Location2)
##
## Permutation: free
## Number of permutations: 999
##
## Terms added sequentially (first to last)
##
## Df SumsOfSqs MeanSqs F.Model R2 Pr(>F)
## sample_data(ps)$Location2 6 0.8185 0.13642 0.67385 0.23723 0.945
## Residuals 13 2.6318 0.20245 0.76277
## Total 19 3.4503 1.00000
Here the first 3 adonis tests are for weighted unifrac and the second 3 are for unweighted.
options(warn = -1)
# Weighted and unweighted UniFrac PCoA on the rarefied counts, plotted side by side,
# followed by PERMANOVA tests on the weighted distances
ps = physeq_rare
ps.ord.w <- ordinate(ps, "PCoA", "unifrac", weighted=TRUE)
w <- plot_ordination(ps, ps.ord.w, type="samples", color="Treatment", shape="Location", title="Weighted Unifrac")
ps.ord.uw <- ordinate(ps, "PCoA", "unifrac", weighted=FALSE)
uw <- plot_ordination(ps, ps.ord.uw, type="samples", color="Treatment", shape="Location", title="Unweighted Unifrac")
grid.arrange(w, uw, nrow = 1)
distance <- phyloseq::distance(ps, method="unifrac", weighted=T)
print(adonis(distance ~ sample_data(ps)$Treatment))
##
## Call:
## adonis(formula = distance ~ sample_data(ps)$Treatment)
##
## Permutation: free
## Number of permutations: 999
##
## Terms added sequentially (first to last)
##
## Df SumsOfSqs MeanSqs F.Model R2 Pr(>F)
## sample_data(ps)$Treatment 1 0.00924 0.0092431 0.49065 0.02653 0.832
## Residuals 18 0.33909 0.0188385 0.97347
## Total 19 0.34834 1.00000
print(adonis(distance ~ sample_data(ps)$Location))
##
## Call:
## adonis(formula = distance ~ sample_data(ps)$Location)
##
## Permutation: free
## Number of permutations: 999
##
## Terms added sequentially (first to last)
##
## Df SumsOfSqs MeanSqs F.Model R2 Pr(>F)
## sample_data(ps)$Location 4 0.05281 0.013203 0.67017 0.15162 0.844
## Residuals 15 0.29552 0.019702 0.84838
## Total 19 0.34834 1.00000
print(adonis(distance ~ sample_data(ps)$Location2))
##
## Call:
## adonis(formula = distance ~ sample_data(ps)$Location2)
##
## Permutation: free
## Number of permutations: 999
##
## Terms added sequentially (first to last)
##
## Df SumsOfSqs MeanSqs F.Model R2 Pr(>F)
## sample_data(ps)$Location2 6 0.06781 0.011301 0.52371 0.19466 0.982
## Residuals 13 0.28053 0.021579 0.80534
## Total 19 0.34834 1.00000
distance <- phyloseq::distance(ps, method="unifrac", weighted=F)
print(adonis(distance ~ sample_data(ps)$Treatment))
##
## Call:
## adonis(formula = distance ~ sample_data(ps)$Treatment)
##
## Permutation: free
## Number of permutations: 999
##
## Terms added sequentially (first to last)
##
## Df SumsOfSqs MeanSqs F.Model R2 Pr(>F)
## sample_data(ps)$Treatment 1 0.2011 0.20114 1.0828 0.05674 0.285
## Residuals 18 3.3438 0.18577 0.94326
## Total 19 3.5449 1.00000
print(adonis(distance ~ sample_data(ps)$Location))
##
## Call:
## adonis(formula = distance ~ sample_data(ps)$Location)
##
## Permutation: free
## Number of permutations: 999
##
## Terms added sequentially (first to last)
##
## Df SumsOfSqs MeanSqs F.Model R2 Pr(>F)
## sample_data(ps)$Location 4 0.5161 0.12902 0.63896 0.14558 0.98
## Residuals 15 3.0288 0.20192 0.85442
## Total 19 3.5449 1.00000
print(adonis(distance ~ sample_data(ps)$Location2))
##
## Call:
## adonis(formula = distance ~ sample_data(ps)$Location2)
##
## Permutation: free
## Number of permutations: 999
##
## Terms added sequentially (first to last)
##
## Df SumsOfSqs MeanSqs F.Model R2 Pr(>F)
## sample_data(ps)$Location2 6 0.8431 0.14052 0.67615 0.23785 0.966
## Residuals 13 2.7018 0.20783 0.76215
## Total 19 3.5449 1.00000
Here the first 3 adonis tests are for weighted unifrac and the second 3 are for unweighted.
options(warn = -1)
# Weighted and unweighted UniFrac PCoA on the relative abundances, plotted side by side,
# followed by PERMANOVA tests on the weighted distances
ps = physeq_relabun
ps.ord.w <- ordinate(ps, "PCoA", "unifrac", weighted=TRUE)
w <- plot_ordination(ps, ps.ord.w, type="samples", color="Treatment", shape="Location", title="Weighted Unifrac")
ps.ord.uw <- ordinate(ps, "PCoA", "unifrac", weighted=FALSE)
uw <- plot_ordination(ps, ps.ord.uw, type="samples", color="Treatment", shape="Location", title="Unweighted Unifrac")
grid.arrange(w, uw, nrow = 1)
distance <- phyloseq::distance(ps, method="unifrac", weighted=T)
print(adonis(distance ~ sample_data(ps)$Treatment))
##
## Call:
## adonis(formula = distance ~ sample_data(ps)$Treatment)
##
## Permutation: free
## Number of permutations: 999
##
## Terms added sequentially (first to last)
##
## Df SumsOfSqs MeanSqs F.Model R2 Pr(>F)
## sample_data(ps)$Treatment 1 0.00856 0.0085581 0.45908 0.02487 0.88
## Residuals 18 0.33555 0.0186416 0.97513
## Total 19 0.34411 1.00000
print(adonis(distance ~ sample_data(ps)$Location))
##
## Call:
## adonis(formula = distance ~ sample_data(ps)$Location)
##
## Permutation: free
## Number of permutations: 999
##
## Terms added sequentially (first to last)
##
## Df SumsOfSqs MeanSqs F.Model R2 Pr(>F)
## sample_data(ps)$Location 4 0.05275 0.013187 0.67889 0.15329 0.819
## Residuals 15 0.29136 0.019424 0.84671
## Total 19 0.34411 1.00000
print(adonis(distance ~ sample_data(ps)$Location2))
##
## Call:
## adonis(formula = distance ~ sample_data(ps)$Location2)
##
## Permutation: free
## Number of permutations: 999
##
## Terms added sequentially (first to last)
##
## Df SumsOfSqs MeanSqs F.Model R2 Pr(>F)
## sample_data(ps)$Location2 6 0.06640 0.011067 0.51806 0.19296 0.98
## Residuals 13 0.27771 0.021362 0.80704
## Total 19 0.34411 1.00000
distance <- phyloseq::distance(ps, method="unifrac", weighted=F)
print(adonis(distance ~ sample_data(ps)$Treatment))
##
## Call:
## adonis(formula = distance ~ sample_data(ps)$Treatment)
##
## Permutation: free
## Number of permutations: 999
##
## Terms added sequentially (first to last)
##
## Df SumsOfSqs MeanSqs F.Model R2 Pr(>F)
## sample_data(ps)$Treatment 1 0.1884 0.18844 1.0399 0.05462 0.311
## Residuals 18 3.2618 0.18121 0.94538
## Total 19 3.4503 1.00000
print(adonis(distance ~ sample_data(ps)$Location))
##
## Call:
## adonis(formula = distance ~ sample_data(ps)$Location)
##
## Permutation: free
## Number of permutations: 999
##
## Terms added sequentially (first to last)
##
## Df SumsOfSqs MeanSqs F.Model R2 Pr(>F)
## sample_data(ps)$Location 4 0.4880 0.12200 0.61778 0.14144 0.961
## Residuals 15 2.9623 0.19748 0.85856
## Total 19 3.4503 1.00000
print(adonis(distance ~ sample_data(ps)$Location2))
##
## Call:
## adonis(formula = distance ~ sample_data(ps)$Location2)
##
## Permutation: free
## Number of permutations: 999
##
## Terms added sequentially (first to last)
##
## Df SumsOfSqs MeanSqs F.Model R2 Pr(>F)
## sample_data(ps)$Location2 6 0.8185 0.13642 0.67385 0.23723 0.95
## Residuals 13 2.6318 0.20245 0.76277
## Total 19 3.4503 1.00000
options(warn = -1)
# Aitchison distance: Euclidean distance on the CLR-transformed counts
ps = physeq_clr
ps.ord <- ordinate(ps, "PCoA", "euclidean")
plot_ordination(ps, ps.ord, type="samples", color="Treatment", shape="Location")
distance <- phyloseq::distance(ps, method="euclidean") #`weighted` only applies to UniFrac, so dropped
print(adonis(distance ~ sample_data(ps)$Treatment))
##
## Call:
## adonis(formula = distance ~ sample_data(ps)$Treatment)
##
## Permutation: free
## Number of permutations: 999
##
## Terms added sequentially (first to last)
##
## Df SumsOfSqs MeanSqs F.Model R2 Pr(>F)
## sample_data(ps)$Treatment 1 5957 5957.4 1.0354 0.05439 0.288
## Residuals 18 103571 5754.0 0.94561
## Total 19 109529 1.00000
print(adonis(distance ~ sample_data(ps)$Location))
##
## Call:
## adonis(formula = distance ~ sample_data(ps)$Location)
##
## Permutation: free
## Number of permutations: 999
##
## Terms added sequentially (first to last)
##
## Df SumsOfSqs MeanSqs F.Model R2 Pr(>F)
## sample_data(ps)$Location 4 16985 4246.3 0.68827 0.15508 0.96
## Residuals 15 92543 6169.6 0.84492
## Total 19 109529 1.00000
print(adonis(distance ~ sample_data(ps)$Location2))
##
## Call:
## adonis(formula = distance ~ sample_data(ps)$Location2)
##
## Permutation: free
## Number of permutations: 999
##
## Terms added sequentially (first to last)
##
## Df SumsOfSqs MeanSqs F.Model R2 Pr(>F)
## sample_data(ps)$Location2 6 26489 4414.8 0.69114 0.24184 0.969
## Residuals 13 83040 6387.7 0.75816
## Total 19 109529 1.00000
options(warn = -1)
# PhILR: isometric log-ratio transform that uses the phylogeny as the sequential binary partition
physeq_philr = physeq
physeq_philr <- transform_sample_counts(physeq_philr, function(x) x+1) #pseudocount: philr needs strictly positive counts
phy_tree(physeq_philr) <- makeNodeLabel(phy_tree(physeq_philr), method="number", prefix='n') #philr needs named internal nodes
otu.table <- t(otu_table(physeq_philr)) #philr expects samples as rows
tree <- phy_tree(physeq_philr)
metadata <- sample_data(physeq_philr)
tax <- tax_table(physeq_philr)
ps = physeq_philr
physeq.philr <- philr(otu.table, tree, part.weights='enorm.x.gm.counts', ilr.weights='blw.sqrt')
philr.dist <- dist(physeq.philr, method="euclidean")
# NOTE(review): ordinate() is called on `physeq` rather than `ps`; since a precomputed
# distance matrix is supplied only the distances should matter, but confirm this was intended
ps.ord <- ordinate(physeq, 'PCoA', distance=philr.dist)
plot_ordination(ps, ps.ord, type="samples", color="Treatment", shape="Location")
print(adonis(philr.dist ~ sample_data(ps)$Treatment))
##
## Call:
## adonis(formula = philr.dist ~ sample_data(ps)$Treatment)
##
## Permutation: free
## Number of permutations: 999
##
## Terms added sequentially (first to last)
##
## Df SumsOfSqs MeanSqs F.Model R2 Pr(>F)
## sample_data(ps)$Treatment 1 129.44 129.44 1.1224 0.0587 0.298
## Residuals 18 2075.76 115.32 0.9413
## Total 19 2205.20 1.0000
print(adonis(philr.dist ~ sample_data(ps)$Location))
##
## Call:
## adonis(formula = philr.dist ~ sample_data(ps)$Location)
##
## Permutation: free
## Number of permutations: 999
##
## Terms added sequentially (first to last)
##
## Df SumsOfSqs MeanSqs F.Model R2 Pr(>F)
## sample_data(ps)$Location 4 327.72 81.931 0.65458 0.14861 0.861
## Residuals 15 1877.47 125.165 0.85139
## Total 19 2205.20 1.00000
print(adonis(philr.dist ~ sample_data(ps)$Location2))
##
## Call:
## adonis(formula = philr.dist ~ sample_data(ps)$Location2)
##
## Permutation: free
## Number of permutations: 999
##
## Terms added sequentially (first to last)
##
## Df SumsOfSqs MeanSqs F.Model R2 Pr(>F)
## sample_data(ps)$Location2 6 465.78 77.63 0.58019 0.21122 0.96
## Residuals 13 1739.42 133.80 0.78878
## Total 19 2205.20 1.00000
options(warn = -1)
palette = distinctColorPalette(30)

# Stacked barplot of the `n` most abundant taxa at taxonomic rank `rnk` for the
# phyloseq object `ps`, faceted by Treatment and Location.  Collapses counts to
# the requested rank, ranks taxa by total abundance, keeps the top `n`, and
# colours bars from the shared global `palette`.  Replaces six copy-pasted chunks.
plot_top_taxa <- function(ps, rnk, n = 30) {
  ps.rank <- tax_glom(ps, taxrank = rnk, NArm = FALSE)
  rank.sum <- tapply(taxa_sums(ps.rank), tax_table(ps.rank)[, rnk], sum, na.rm = TRUE)
  top <- names(sort(rank.sum, TRUE))[1:n]
  ps.rank <- prune_taxa((tax_table(ps.rank)[, rnk] %in% top), ps.rank)
  plot_bar(ps.rank, fill = rnk) +
    facet_wrap(c(~Treatment, ~Location), scales = "free_x", nrow = 1) +
    theme(legend.text = element_text(size = 5), legend.key.size = unit(0.3, "cm")) +
    guides(fill = guide_legend(ncol = 1)) +
    scale_fill_manual(values = palette)
}

# One plot per rank, ta2 (phylum) down to ta7 (species), on the relative abundances
plot_top_taxa(physeq_relabun, "ta2")
plot_top_taxa(physeq_relabun, "ta3")
plot_top_taxa(physeq_relabun, "ta4")
plot_top_taxa(physeq_relabun, "ta5")
plot_top_taxa(physeq_relabun, "ta6")
plot_top_taxa(physeq_relabun, "ta7")
options(warn = -1)

# Stacked barplot of the `n` most abundant taxa at taxonomic rank `rnk` for the
# phyloseq object `ps`, faceted by Treatment and Location (same helper as for the
# relative-abundance plots; redefining it here keeps this chunk self-contained).
plot_top_taxa <- function(ps, rnk, n = 30) {
  ps.rank <- tax_glom(ps, taxrank = rnk, NArm = FALSE)
  rank.sum <- tapply(taxa_sums(ps.rank), tax_table(ps.rank)[, rnk], sum, na.rm = TRUE)
  top <- names(sort(rank.sum, TRUE))[1:n]
  ps.rank <- prune_taxa((tax_table(ps.rank)[, rnk] %in% top), ps.rank)
  plot_bar(ps.rank, fill = rnk) +
    facet_wrap(c(~Treatment, ~Location), scales = "free_x", nrow = 1) +
    theme(legend.text = element_text(size = 5), legend.key.size = unit(0.3, "cm")) +
    guides(fill = guide_legend(ncol = 1)) +
    scale_fill_manual(values = palette)
}

# One plot per rank, ta2 (phylum) down to ta7 (species), on the rarefied counts
plot_top_taxa(physeq_rare, "ta2")
plot_top_taxa(physeq_rare, "ta3")
plot_top_taxa(physeq_rare, "ta4")
plot_top_taxa(physeq_rare, "ta5")
plot_top_taxa(physeq_rare, "ta6")
plot_top_taxa(physeq_rare, "ta7")
Here we’re plotting phylogenetic trees that show the abundance of the top 50 taxa within different sample types. I’m plotting only relative abundance, just to get an idea of what can be done here.
options(warn = -1)
# Phylogenetic tree of the 50 most abundant ASVs (relative abundance), tips coloured by Location
ps = physeq_relabun
top50 = names(sort(taxa_sums(ps), TRUE))[1:50]
ps_trim = prune_taxa(top50, ps)
plot_tree(ps_trim, color="Location", label.tips = "ta5", ladderize="left", size="Abundance")
options(warn = -1)
# Same tree, tips coloured by Treatment
ps = physeq_relabun
top50 = names(sort(taxa_sums(ps), TRUE))[1:50]
ps_trim = prune_taxa(top50, ps)
plot_tree(ps_trim, color="Treatment", label.tips = "ta5", ladderize="left", size="Abundance")
Here we’re plotting colours based on the CLR abundance and showing the name of the lowest classification available for each ASV. We’ve just plotted the top 75 most abundant ASVs.
options(warn = -1)
# ggtree heatmap of the top 75 ASVs by CLR abundance, columns grouped by Location2.
# microbiome/phyloseq are detached before loading ggtree to avoid S4 method clashes.
library(microbiome)
library(phyloseq)
ps = physeq_clr
top75 = names(sort(taxa_sums(ps), TRUE))[1:75]
ps = prune_taxa(top75, ps)
tree = phy_tree(ps)
asv = otu_table(ps)
tax = tax_table(ps)
detach("package:microbiome", unload=TRUE, force=TRUE)
detach("package:phyloseq", unload=TRUE, force=TRUE)
library("ggtree")
asv_df = as.data.frame(asv)
tax_df = as.data.frame(tax, stringsAsFactors=F)
# NOTE(review): sample_names() is called after phyloseq is detached -- confirm it still resolves
data_plot = asv_df[,sample_names(ps)]
rns <- rownames(samples)
locs <- samples[["Location2"]]
cn <- colnames(data_plot)
# Relabel each sample column with its Location2 group.  Replaces a nested O(n^2)
# loop that grew `rename` with c(); match() performs the same positional lookup.
# Assumes every column name appears in rownames(samples) -- TODO confirm
rename = as.character(locs[match(cn, rns)])
colnames(data_plot) = rename
# Average CLR values across replicate columns that now share a group label
data_mean <- as.data.frame(sapply(unique(names(data_plot)), function(col) rowMeans(data_plot[names(data_plot) == col])))
# Join abundances with taxonomy and give the rank columns readable names
asv_tax_full <- merge(asv_df, tax_df, by="row.names")
asv_tax <- asv_tax_full[,c("Row.names", "ta1", "ta2", "ta3", "ta4", "ta5", "ta6", "ta7")]
levels = c("Domain", "Phylum", "Class", "Order", "Family", "Genus", "Species")
colnames(asv_tax) <- c("label", levels)
levels = levels[7:1] #ranks from most to least specific (Species -> Domain)
labels = tree$tip.label
# For each tip, label it with its lowest (most specific) non-NA taxonomic rank.
# Replaces a nested loop that grew `species_name` with c() in O(n^2) time.
# Assumes every tip label is present in asv_tax$label -- TODO confirm
rank_order <- c("Species", "Genus", "Family", "Order", "Class", "Phylum", "Domain")
species_name <- vapply(match(labels, asv_tax$label), function(j) {
  vals <- unlist(asv_tax[j, rank_order], use.names = FALSE)
  vals[which(!is.na(vals))[1]]
}, character(1))
d <- data.frame(label=tree$tip.label, species=species_name)
# Fan-layout tree with the group-mean CLR heatmap around it and taxon labels on the tips
p <- ggtree(tree, layout="fan", open.angle=15)
p <- gheatmap(p, data_mean, colnames_angle=90, font.size=2, hjust=1, color="black", offset=1.3) + scale_fill_viridis_b()
p <- p %<+% d + geom_tiplab(aes(label=species), parse=F, size=2, align=T, linesize=.12)
p
Here we’re plotting colours based on the CLR abundance and showing the name of the lowest classification available for each ASV. We’ve just plotted the top 75 most abundant ASVs.
options(warn = -1)
# ggtree heatmap of the top 75 ASVs by CLR abundance, columns grouped by Treatment.
# microbiome/phyloseq are detached before loading ggtree to avoid S4 method clashes.
library(microbiome)
library(phyloseq)
ps = physeq_clr
top75 = names(sort(taxa_sums(ps), TRUE))[1:75]
ps = prune_taxa(top75, ps)
tree = phy_tree(ps)
asv = otu_table(ps)
tax = tax_table(ps)
detach("package:microbiome", unload=TRUE, force=TRUE)
detach("package:phyloseq", unload=TRUE, force=TRUE)
library("ggtree")
asv_df = as.data.frame(asv)
tax_df = as.data.frame(tax, stringsAsFactors=F)
# NOTE(review): sample_names() is called after phyloseq is detached -- confirm it still resolves
data_plot = asv_df[,sample_names(ps)]
rns <- rownames(samples)
locs <- samples[["Treatment"]]
cn <- colnames(data_plot)
# Relabel each sample column with its Treatment group.  Replaces a nested O(n^2)
# loop that grew `rename` with c(); match() performs the same positional lookup.
# Assumes every column name appears in rownames(samples) -- TODO confirm
rename = as.character(locs[match(cn, rns)])
colnames(data_plot) = rename
# Average CLR values across replicate columns that now share a group label
data_mean <- as.data.frame(sapply(unique(names(data_plot)), function(col) rowMeans(data_plot[names(data_plot) == col])))
# Join abundances with taxonomy and give the rank columns readable names
asv_tax_full <- merge(asv_df, tax_df, by="row.names")
asv_tax <- asv_tax_full[,c("Row.names", "ta1", "ta2", "ta3", "ta4", "ta5", "ta6", "ta7")]
levels = c("Domain", "Phylum", "Class", "Order", "Family", "Genus", "Species")
colnames(asv_tax) <- c("label", levels)
levels = levels[7:1] #ranks from most to least specific (Species -> Domain)
labels = tree$tip.label
# For each tip, label it with its lowest (most specific) non-NA taxonomic rank.
# Replaces a nested loop that grew `species_name` with c() in O(n^2) time.
# Assumes every tip label is present in asv_tax$label -- TODO confirm
rank_order <- c("Species", "Genus", "Family", "Order", "Class", "Phylum", "Domain")
species_name <- vapply(match(labels, asv_tax$label), function(j) {
  vals <- unlist(asv_tax[j, rank_order], use.names = FALSE)
  vals[which(!is.na(vals))[1]]
}, character(1))
d <- data.frame(label=tree$tip.label, species=species_name)
# Fan-layout tree with the group-mean CLR heatmap around it and taxon labels on the tips
p <- ggtree(tree, layout="fan", open.angle=15)
p <- gheatmap(p, data_mean, colnames_angle=90, font.size=2, hjust=1, color="black", offset=1.3) + scale_fill_viridis_b()
p <- p %<+% d + geom_tiplab(aes(label=species), parse=F, size=2, align=T, linesize=.12)
p
Here we’re plotting colours based on the CLR abundance and showing the name of the lowest classification available for each ASV. We’ve just plotted the top 75 most abundant ASVs.
options(warn = -1)
# ggtree heatmap of the top 75 ASVs by CLR abundance, columns grouped by Location.
# microbiome/phyloseq are detached before loading ggtree to avoid S4 method clashes.
library(microbiome)
library(phyloseq)
ps = physeq_clr
top75 = names(sort(taxa_sums(ps), TRUE))[1:75]
ps = prune_taxa(top75, ps)
tree = phy_tree(ps)
asv = otu_table(ps)
tax = tax_table(ps)
detach("package:microbiome", unload=TRUE, force=TRUE)
detach("package:phyloseq", unload=TRUE, force=TRUE)
library("ggtree")
asv_df = as.data.frame(asv)
tax_df = as.data.frame(tax, stringsAsFactors=F)
# NOTE(review): sample_names() is called after phyloseq is detached -- confirm it still resolves
data_plot = asv_df[,sample_names(ps)]
rns <- rownames(samples)
locs <- samples[["Location"]]
cn <- colnames(data_plot)
# Relabel each sample column with its Location group.  Replaces a nested O(n^2)
# loop that grew `rename` with c(); match() performs the same positional lookup.
# Assumes every column name appears in rownames(samples) -- TODO confirm
rename = as.character(locs[match(cn, rns)])
colnames(data_plot) = rename
# Average CLR values across replicate columns that now share a group label
data_mean <- as.data.frame(sapply(unique(names(data_plot)), function(col) rowMeans(data_plot[names(data_plot) == col])))
# Join abundances with taxonomy and give the rank columns readable names
asv_tax_full <- merge(asv_df, tax_df, by="row.names")
asv_tax <- asv_tax_full[,c("Row.names", "ta1", "ta2", "ta3", "ta4", "ta5", "ta6", "ta7")]
levels = c("Domain", "Phylum", "Class", "Order", "Family", "Genus", "Species")
colnames(asv_tax) <- c("label", levels)
levels = levels[7:1] #ranks from most to least specific (Species -> Domain)
labels = tree$tip.label
# For each tip, label it with its lowest (most specific) non-NA taxonomic rank.
# Replaces a nested loop that grew `species_name` with c() in O(n^2) time.
# Assumes every tip label is present in asv_tax$label -- TODO confirm
rank_order <- c("Species", "Genus", "Family", "Order", "Class", "Phylum", "Domain")
species_name <- vapply(match(labels, asv_tax$label), function(j) {
  vals <- unlist(asv_tax[j, rank_order], use.names = FALSE)
  vals[which(!is.na(vals))[1]]
}, character(1))
d <- data.frame(label=tree$tip.label, species=species_name)
p <- ggtree(tree, layout="fan", open.angle=15)
p <- gheatmap(p, data_mean, colnames_angle=90, font.size=2, hjust=1, color="black", offset=1.3) + scale_fill_viridis_b()
p <- p %<+% d + geom_tiplab(aes(label=species), parse=F, size=2, align=T, linesize=.12)
p
Metacoder is an R package that allows you to plot differential abundance between groups across all phylogenetic levels. The plots look kind of like a phylogenetic tree - they are ordered by phylogeny, but the branch lengths aren’t related to how each taxon is related to one another. Each node corresponds to a taxonomic level. The larger the node, the larger the number of ASVs within that node. Each edge is coloured by whether there is a statistically significant difference in the abundance between the two treatments being compared. Whether a taxon is considered differentially abundant is based on a Wilcoxon rank sum test p<0.05 and the fold change shown is between the two centered log ratios.
Sort the phyloseq objects:
# Build a semicolon-separated "lineage" string for every ASV of the CLR
# phyloseq object, merge it onto the ASV table (-> asv_tax), and convert
# the phyloseq object into metacoder's format (-> x).
options(warn = -1)
asvs = otu_table(physeq_clr)
tax = data.frame(tax_table(physeq_clr))
# Drop the rank prefix (e.g. "d__", "p__") from every taxonomy cell,
# keeping whatever follows the double underscore.
strip_rank_prefix <- function(cell) strsplit(cell, "__")[[1]][2]
tax_new = apply(tax, MARGIN=c(1,2), strip_rank_prefix)
tax_new_coll = data.frame(tax_new)
# Concatenate the seven rank columns (ta1..ta7) into one lineage string.
tax_new_coll$lineage = do.call(paste, c(tax_new_coll[paste0("ta", 1:7)], list(sep=";")))
tax_new_coll = tax_new_coll[c("lineage")]
asv_tax = merge(asvs, tax_new_coll, by="row.names")
x <- parse_phyloseq(physeq_clr)
Here, node sizes are based on how many ASVs fall underneath that phylogeny while colors are based on the CLR abundance.
Calculate:
# Build the metacoder object and a diff_table holding the mean CLR
# abundance of the Control group (func returns mean(x) only), used by the
# abundance heat trees below.
options(warn = -1)
obj <- parse_tax_data(asv_tax, class_cols = "lineage", class_sep = ";")
obj$data$tax_data <- calc_obs_props(obj, "tax_data")
obj$data$tax_abund <- calc_taxon_abund(obj, "tax_data")
obj$data$sample_data = x$data$sample_data
set.seed(1)
cn = colnames(obj$data$tax_data)
rn = rownames(obj$data$sample_data)
# Vectorised replacement for the O(n^2) nested matching loop: map each
# abundance column (cn[-1]; the first column is the taxon id) to its
# metadata row.  Assumes sample_id values are unique (the loop version
# silently duplicated entries otherwise).
idx <- match(cn[-1], obj$data$sample_data$sample_id)
sel <- !is.na(idx)
keeping <- cn[-1][sel]
# Everything not labelled Control is grouped as Treatment.
group <- ifelse(obj$data$sample_data$Treatment[idx[sel]] == "Control", "Control", "Treatment")
obj$data$diff_table <- compare_groups(obj, dataset = "tax_abund", cols=keeping, groups=group, func=function(x, y) mean(x))
options(warn = -1)
# Heat tree coloured by the first group's mean CLR abundance
# (diff_table$value comes from func = mean(x) above).
heat_tree(obj, node_label=taxon_names, node_size=n_obs, node_color=value, layout="davidson-harel", initial_layout="reingold-tilford")
options(warn = -1)
# Same plot for the second group: recompute the diff_table with mean(y).
obj$data$diff_table <- compare_groups(obj, dataset = "tax_abund", cols=keeping, groups=group, func=function(x, y) mean(y))
heat_tree(obj, node_label=taxon_names, node_size=n_obs, node_color=value, layout="davidson-harel", initial_layout="reingold-tilford")
Node sizes are again based on the number of ASVs and colours are based on the fold change between the two values.
Calculate:
# Compare CLR taxon abundances between Control and Treatment (all sites)
# with per-taxon Wilcoxon rank sum tests across all taxonomic ranks.
options(warn = -1)
obj <- parse_tax_data(asv_tax, class_cols = "lineage", class_sep = ";")
obj$data$tax_data <- calc_obs_props(obj, "tax_data")
obj$data$tax_abund <- calc_taxon_abund(obj, "tax_data")
obj$data$sample_data = x$data$sample_data
set.seed(1)
cn = colnames(obj$data$tax_data)
rn = rownames(obj$data$sample_data)
# Vectorised replacement for the O(n^2) nested matching loop: map each
# abundance column (cn[-1]; the first column is the taxon id) to its
# metadata row.  Assumes sample_id values are unique.
idx <- match(cn[-1], obj$data$sample_data$sample_id)
sel <- !is.na(idx)
keeping <- cn[-1][sel]
# Everything not labelled Control is grouped as Treatment.
group <- ifelse(obj$data$sample_data$Treatment[idx[sel]] == "Control", "Control", "Treatment")
obj$data$diff_table <- compare_groups(obj, dataset = "tax_abund", cols=keeping, groups=group, func=function(x, y) {
  # Difference of group medians of CLR values; NOTE(review): the name
  # log2_median_ratio assumes the CLR used base-2 logs -- confirm.
  log_ratio <- median(x) - median(y)
  if (is.nan(log_ratio)) {
    log_ratio <- 0
  }
  list(log2_median_ratio = log_ratio,
       median_diff = median(x) - median(y),
       mean_diff = mean(x) - mean(y),
       wilcox_p_value = wilcox.test(x, y)$p.value)
})
Fold change only:
# Heat tree coloured by the unfiltered median CLR difference.
heat_tree(obj, node_label=taxon_names, node_size=n_obs, node_color=log2_median_ratio, layout="davidson-harel", initial_layout="reingold-tilford")
With testing for significant differences (now the tree is only coloured if differences are significant):
# FDR-correct the Wilcoxon p values, print their range, zero out the fold
# change of non-significant taxa (adjusted p > 0.05), then re-plot.
obj$data$diff_table$wilcox_p_value <- p.adjust(obj$data$diff_table$wilcox_p_value, method="fdr")
range(obj$data$diff_table$wilcox_p_value, finite=TRUE)
obj$data$diff_table$log2_median_ratio[obj$data$diff_table$wilcox_p_value > 0.05] <- 0
heat_tree(obj, node_label=taxon_names, node_size=n_obs, node_color=log2_median_ratio, layout="davidson-harel", initial_layout="reingold-tilford")
Calculate:
# Compare Control vs Treatment within Site2 only, with per-taxon
# Wilcoxon rank sum tests across all taxonomic ranks.
options(warn = -1)
obj <- parse_tax_data(asv_tax, class_cols = "lineage", class_sep = ";")
obj$data$tax_data <- calc_obs_props(obj, "tax_data")
obj$data$tax_abund <- calc_taxon_abund(obj, "tax_data")
obj$data$sample_data = x$data$sample_data
set.seed(1)
cn = colnames(obj$data$tax_data)
rn = rownames(obj$data$sample_data)
# Vectorised replacement for the O(n^2) nested matching loop: map each
# abundance column (cn[-1]; the first column is the taxon id) to its
# metadata row and keep Site2 samples only.  Assumes unique sample_ids.
idx <- match(cn[-1], obj$data$sample_data$sample_id)
loc <- obj$data$sample_data$Location[idx]
sel <- !is.na(idx) & !is.na(loc) & loc == "Site2"
keeping <- cn[-1][sel]
group <- as.character(obj$data$sample_data$Treatment[idx[sel]])
obj$data$diff_table <- compare_groups(obj, dataset = "tax_abund", cols=keeping, groups=group, func=function(x, y) {
  # Difference of group medians of CLR values; NOTE(review): the name
  # log2_median_ratio assumes the CLR used base-2 logs -- confirm.
  log_ratio <- median(x) - median(y)
  if (is.nan(log_ratio)) {
    log_ratio <- 0
  }
  list(log2_median_ratio = log_ratio,
       median_diff = median(x) - median(y),
       mean_diff = mean(x) - mean(y),
       wilcox_p_value = wilcox.test(x, y)$p.value)
})
Fold change only:
# Heat tree coloured by the unfiltered median CLR difference.
heat_tree(obj, node_label=taxon_names, node_size=n_obs, node_color=log2_median_ratio, layout="davidson-harel", initial_layout="reingold-tilford")
With testing for significant differences (now the tree is only coloured if differences are significant):
# FDR-correct the Wilcoxon p values, print their range, zero out the fold
# change of non-significant taxa (adjusted p > 0.05), then re-plot.
obj$data$diff_table$wilcox_p_value <- p.adjust(obj$data$diff_table$wilcox_p_value, method="fdr")
range(obj$data$diff_table$wilcox_p_value, finite=TRUE)
obj$data$diff_table$log2_median_ratio[obj$data$diff_table$wilcox_p_value > 0.05] <- 0
heat_tree(obj, node_label=taxon_names, node_size=n_obs, node_color=log2_median_ratio, layout="davidson-harel", initial_layout="reingold-tilford")
Calculate:
# Compare Control vs Treatment within Site4 only, with per-taxon
# Wilcoxon rank sum tests across all taxonomic ranks.
options(warn = -1)
obj <- parse_tax_data(asv_tax, class_cols = "lineage", class_sep = ";")
obj$data$tax_data <- calc_obs_props(obj, "tax_data")
obj$data$tax_abund <- calc_taxon_abund(obj, "tax_data")
obj$data$sample_data = x$data$sample_data
set.seed(1)
cn = colnames(obj$data$tax_data)
rn = rownames(obj$data$sample_data)
# Vectorised replacement for the O(n^2) nested matching loop: map each
# abundance column (cn[-1]; the first column is the taxon id) to its
# metadata row and keep Site4 samples only.  Assumes unique sample_ids.
idx <- match(cn[-1], obj$data$sample_data$sample_id)
loc <- obj$data$sample_data$Location[idx]
sel <- !is.na(idx) & !is.na(loc) & loc == "Site4"
keeping <- cn[-1][sel]
group <- as.character(obj$data$sample_data$Treatment[idx[sel]])
obj$data$diff_table <- compare_groups(obj, dataset = "tax_abund", cols=keeping, groups=group, func=function(x, y) {
  # Difference of group medians of CLR values; NOTE(review): the name
  # log2_median_ratio assumes the CLR used base-2 logs -- confirm.
  log_ratio <- median(x) - median(y)
  if (is.nan(log_ratio)) {
    log_ratio <- 0
  }
  list(log2_median_ratio = log_ratio,
       median_diff = median(x) - median(y),
       mean_diff = mean(x) - mean(y),
       wilcox_p_value = wilcox.test(x, y)$p.value)
})
Fold change only:
# Heat tree coloured by the unfiltered median CLR difference.
heat_tree(obj, node_label=taxon_names, node_size=n_obs, node_color=log2_median_ratio, layout="davidson-harel", initial_layout="reingold-tilford")
With testing for significant differences (now the tree is only coloured if differences are significant):
# FDR-correct the Wilcoxon p values, print their range, zero out the fold
# change of non-significant taxa (adjusted p > 0.05), then re-plot.
obj$data$diff_table$wilcox_p_value <- p.adjust(obj$data$diff_table$wilcox_p_value, method="fdr")
range(obj$data$diff_table$wilcox_p_value, finite=TRUE)
obj$data$diff_table$log2_median_ratio[obj$data$diff_table$wilcox_p_value > 0.05] <- 0
heat_tree(obj, node_label=taxon_names, node_size=n_obs, node_color=log2_median_ratio, layout="davidson-harel", initial_layout="reingold-tilford")
Calculate:
# Compare CLR taxon abundances between Site1 and Site2 with per-taxon
# Wilcoxon rank sum tests across all taxonomic ranks.
options(warn = -1)
s1 = "Site1"
s2 = "Site2"
obj <- parse_tax_data(asv_tax, class_cols = "lineage", class_sep = ";")
obj$data$tax_data <- calc_obs_props(obj, "tax_data")
obj$data$tax_abund <- calc_taxon_abund(obj, "tax_data")
obj$data$sample_data = x$data$sample_data
set.seed(1)
cn = colnames(obj$data$tax_data)
rn = rownames(obj$data$sample_data)
# Vectorised replacement for the O(n^2) nested matching loop: map each
# abundance column (cn[-1]; the first column is the taxon id) to its
# metadata row and keep only samples from the two sites.  Assumes
# sample_id values are unique.
idx <- match(cn[-1], obj$data$sample_data$sample_id)
loc <- obj$data$sample_data$Location[idx]
sel <- !is.na(idx) & !is.na(loc) & loc %in% c(s1, s2)
keeping <- cn[-1][sel]
group <- as.character(loc[sel])
obj$data$diff_table <- compare_groups(obj, dataset = "tax_abund", cols=keeping, groups=group, func=function(x, y) {
  # Difference of group medians of CLR values; NOTE(review): the name
  # log2_median_ratio assumes the CLR used base-2 logs -- confirm.
  log_ratio <- median(x) - median(y)
  if (is.nan(log_ratio)) {
    log_ratio <- 0
  }
  list(log2_median_ratio = log_ratio,
       median_diff = median(x) - median(y),
       mean_diff = mean(x) - mean(y),
       wilcox_p_value = wilcox.test(x, y)$p.value)
})
Fold change only:
# Heat tree coloured by the unfiltered median CLR difference.
heat_tree(obj, node_label=taxon_names, node_size=n_obs, node_color=log2_median_ratio, layout="davidson-harel", initial_layout="reingold-tilford")
With testing for significant differences (now the tree is only coloured if differences are significant):
# FDR-correct the Wilcoxon p values, print their range, zero out the fold
# change of non-significant taxa (adjusted p > 0.05), then re-plot.
obj$data$diff_table$wilcox_p_value <- p.adjust(obj$data$diff_table$wilcox_p_value, method="fdr")
range(obj$data$diff_table$wilcox_p_value, finite=TRUE)
obj$data$diff_table$log2_median_ratio[obj$data$diff_table$wilcox_p_value > 0.05] <- 0
heat_tree(obj, node_label=taxon_names, node_size=n_obs, node_color=log2_median_ratio, layout="davidson-harel", initial_layout="reingold-tilford")
Calculate:
# Compare CLR taxon abundances between Site1 and Site3 with per-taxon
# Wilcoxon rank sum tests across all taxonomic ranks.
options(warn = -1)
s1 = "Site1"
s2 = "Site3"
obj <- parse_tax_data(asv_tax, class_cols = "lineage", class_sep = ";")
obj$data$tax_data <- calc_obs_props(obj, "tax_data")
obj$data$tax_abund <- calc_taxon_abund(obj, "tax_data")
obj$data$sample_data = x$data$sample_data
set.seed(1)
cn = colnames(obj$data$tax_data)
rn = rownames(obj$data$sample_data)
# Vectorised replacement for the O(n^2) nested matching loop: map each
# abundance column (cn[-1]; the first column is the taxon id) to its
# metadata row and keep only samples from the two sites.  Assumes
# sample_id values are unique.
idx <- match(cn[-1], obj$data$sample_data$sample_id)
loc <- obj$data$sample_data$Location[idx]
sel <- !is.na(idx) & !is.na(loc) & loc %in% c(s1, s2)
keeping <- cn[-1][sel]
group <- as.character(loc[sel])
obj$data$diff_table <- compare_groups(obj, dataset = "tax_abund", cols=keeping, groups=group, func=function(x, y) {
  # Difference of group medians of CLR values; NOTE(review): the name
  # log2_median_ratio assumes the CLR used base-2 logs -- confirm.
  log_ratio <- median(x) - median(y)
  if (is.nan(log_ratio)) {
    log_ratio <- 0
  }
  list(log2_median_ratio = log_ratio,
       median_diff = median(x) - median(y),
       mean_diff = mean(x) - mean(y),
       wilcox_p_value = wilcox.test(x, y)$p.value)
})
Fold change only:
# Heat tree coloured by the unfiltered median CLR difference.
heat_tree(obj, node_label=taxon_names, node_size=n_obs, node_color=log2_median_ratio, layout="davidson-harel", initial_layout="reingold-tilford")
With testing for significant differences (now the tree is only coloured if differences are significant):
# FDR-correct the Wilcoxon p values, print their range, zero out the fold
# change of non-significant taxa (adjusted p > 0.05), then re-plot.
obj$data$diff_table$wilcox_p_value <- p.adjust(obj$data$diff_table$wilcox_p_value, method="fdr")
range(obj$data$diff_table$wilcox_p_value, finite=TRUE)
obj$data$diff_table$log2_median_ratio[obj$data$diff_table$wilcox_p_value > 0.05] <- 0
heat_tree(obj, node_label=taxon_names, node_size=n_obs, node_color=log2_median_ratio, layout="davidson-harel", initial_layout="reingold-tilford")
Calculate:
# Compare CLR taxon abundances between Site1 and Site4 with per-taxon
# Wilcoxon rank sum tests across all taxonomic ranks.
options(warn = -1)
s1 = "Site1"
s2 = "Site4"
obj <- parse_tax_data(asv_tax, class_cols = "lineage", class_sep = ";")
obj$data$tax_data <- calc_obs_props(obj, "tax_data")
obj$data$tax_abund <- calc_taxon_abund(obj, "tax_data")
obj$data$sample_data = x$data$sample_data
set.seed(1)
cn = colnames(obj$data$tax_data)
rn = rownames(obj$data$sample_data)
# Vectorised replacement for the O(n^2) nested matching loop: map each
# abundance column (cn[-1]; the first column is the taxon id) to its
# metadata row and keep only samples from the two sites.  Assumes
# sample_id values are unique.
idx <- match(cn[-1], obj$data$sample_data$sample_id)
loc <- obj$data$sample_data$Location[idx]
sel <- !is.na(idx) & !is.na(loc) & loc %in% c(s1, s2)
keeping <- cn[-1][sel]
group <- as.character(loc[sel])
obj$data$diff_table <- compare_groups(obj, dataset = "tax_abund", cols=keeping, groups=group, func=function(x, y) {
  # Difference of group medians of CLR values; NOTE(review): the name
  # log2_median_ratio assumes the CLR used base-2 logs -- confirm.
  log_ratio <- median(x) - median(y)
  if (is.nan(log_ratio)) {
    log_ratio <- 0
  }
  list(log2_median_ratio = log_ratio,
       median_diff = median(x) - median(y),
       mean_diff = mean(x) - mean(y),
       wilcox_p_value = wilcox.test(x, y)$p.value)
})
Fold change only:
# Heat tree coloured by the unfiltered median CLR difference.
heat_tree(obj, node_label=taxon_names, node_size=n_obs, node_color=log2_median_ratio, layout="davidson-harel", initial_layout="reingold-tilford")
With testing for significant differences (now the tree is only coloured if differences are significant):
# FDR-correct the Wilcoxon p values, print their range, zero out the fold
# change of non-significant taxa (adjusted p > 0.05), then re-plot.
obj$data$diff_table$wilcox_p_value <- p.adjust(obj$data$diff_table$wilcox_p_value, method="fdr")
range(obj$data$diff_table$wilcox_p_value, finite=TRUE)
obj$data$diff_table$log2_median_ratio[obj$data$diff_table$wilcox_p_value > 0.05] <- 0
heat_tree(obj, node_label=taxon_names, node_size=n_obs, node_color=log2_median_ratio, layout="davidson-harel", initial_layout="reingold-tilford")
Calculate:
# Compare CLR taxon abundances between Site1 and Site5 with per-taxon
# Wilcoxon rank sum tests across all taxonomic ranks.
options(warn = -1)
s1 = "Site1"
s2 = "Site5"
obj <- parse_tax_data(asv_tax, class_cols = "lineage", class_sep = ";")
obj$data$tax_data <- calc_obs_props(obj, "tax_data")
obj$data$tax_abund <- calc_taxon_abund(obj, "tax_data")
obj$data$sample_data = x$data$sample_data
set.seed(1)
cn = colnames(obj$data$tax_data)
rn = rownames(obj$data$sample_data)
# Vectorised replacement for the O(n^2) nested matching loop: map each
# abundance column (cn[-1]; the first column is the taxon id) to its
# metadata row and keep only samples from the two sites.  Assumes
# sample_id values are unique.
idx <- match(cn[-1], obj$data$sample_data$sample_id)
loc <- obj$data$sample_data$Location[idx]
sel <- !is.na(idx) & !is.na(loc) & loc %in% c(s1, s2)
keeping <- cn[-1][sel]
group <- as.character(loc[sel])
obj$data$diff_table <- compare_groups(obj, dataset = "tax_abund", cols=keeping, groups=group, func=function(x, y) {
  # Difference of group medians of CLR values; NOTE(review): the name
  # log2_median_ratio assumes the CLR used base-2 logs -- confirm.
  log_ratio <- median(x) - median(y)
  if (is.nan(log_ratio)) {
    log_ratio <- 0
  }
  list(log2_median_ratio = log_ratio,
       median_diff = median(x) - median(y),
       mean_diff = mean(x) - mean(y),
       wilcox_p_value = wilcox.test(x, y)$p.value)
})
Fold change only:
# Heat tree coloured by the unfiltered median CLR difference.
heat_tree(obj, node_label=taxon_names, node_size=n_obs, node_color=log2_median_ratio, layout="davidson-harel", initial_layout="reingold-tilford")
With testing for significant differences (now the tree is only coloured if differences are significant):
# FDR-correct the Wilcoxon p values, print their range, zero out the fold
# change of non-significant taxa (adjusted p > 0.05), then re-plot.
obj$data$diff_table$wilcox_p_value <- p.adjust(obj$data$diff_table$wilcox_p_value, method="fdr")
range(obj$data$diff_table$wilcox_p_value, finite=TRUE)
obj$data$diff_table$log2_median_ratio[obj$data$diff_table$wilcox_p_value > 0.05] <- 0
heat_tree(obj, node_label=taxon_names, node_size=n_obs, node_color=log2_median_ratio, layout="davidson-harel", initial_layout="reingold-tilford")
Calculate:
# Compare CLR taxon abundances between Site2 and Site3 with per-taxon
# Wilcoxon rank sum tests across all taxonomic ranks.
options(warn = -1)
s1 = "Site2"
s2 = "Site3"
obj <- parse_tax_data(asv_tax, class_cols = "lineage", class_sep = ";")
obj$data$tax_data <- calc_obs_props(obj, "tax_data")
obj$data$tax_abund <- calc_taxon_abund(obj, "tax_data")
obj$data$sample_data = x$data$sample_data
set.seed(1)
cn = colnames(obj$data$tax_data)
rn = rownames(obj$data$sample_data)
# Vectorised replacement for the O(n^2) nested matching loop: map each
# abundance column (cn[-1]; the first column is the taxon id) to its
# metadata row and keep only samples from the two sites.  Assumes
# sample_id values are unique.
idx <- match(cn[-1], obj$data$sample_data$sample_id)
loc <- obj$data$sample_data$Location[idx]
sel <- !is.na(idx) & !is.na(loc) & loc %in% c(s1, s2)
keeping <- cn[-1][sel]
group <- as.character(loc[sel])
obj$data$diff_table <- compare_groups(obj, dataset = "tax_abund", cols=keeping, groups=group, func=function(x, y) {
  # Difference of group medians of CLR values; NOTE(review): the name
  # log2_median_ratio assumes the CLR used base-2 logs -- confirm.
  log_ratio <- median(x) - median(y)
  if (is.nan(log_ratio)) {
    log_ratio <- 0
  }
  list(log2_median_ratio = log_ratio,
       median_diff = median(x) - median(y),
       mean_diff = mean(x) - mean(y),
       wilcox_p_value = wilcox.test(x, y)$p.value)
})
Fold change only:
# Heat tree coloured by the unfiltered median CLR difference.
heat_tree(obj, node_label=taxon_names, node_size=n_obs, node_color=log2_median_ratio, layout="davidson-harel", initial_layout="reingold-tilford")
With testing for significant differences (now the tree is only coloured if differences are significant):
# FDR-correct the Wilcoxon p values, print their range, zero out the fold
# change of non-significant taxa (adjusted p > 0.05), then re-plot.
obj$data$diff_table$wilcox_p_value <- p.adjust(obj$data$diff_table$wilcox_p_value, method="fdr")
range(obj$data$diff_table$wilcox_p_value, finite=TRUE)
obj$data$diff_table$log2_median_ratio[obj$data$diff_table$wilcox_p_value > 0.05] <- 0
heat_tree(obj, node_label=taxon_names, node_size=n_obs, node_color=log2_median_ratio, layout="davidson-harel", initial_layout="reingold-tilford")
Calculate:
# Compare CLR taxon abundances between Site2 and Site4 with per-taxon
# Wilcoxon rank sum tests across all taxonomic ranks.
options(warn = -1)
s1 = "Site2"
s2 = "Site4"
obj <- parse_tax_data(asv_tax, class_cols = "lineage", class_sep = ";")
obj$data$tax_data <- calc_obs_props(obj, "tax_data")
obj$data$tax_abund <- calc_taxon_abund(obj, "tax_data")
obj$data$sample_data = x$data$sample_data
set.seed(1)
cn = colnames(obj$data$tax_data)
rn = rownames(obj$data$sample_data)
# Vectorised replacement for the O(n^2) nested matching loop: map each
# abundance column (cn[-1]; the first column is the taxon id) to its
# metadata row and keep only samples from the two sites.  Assumes
# sample_id values are unique.
idx <- match(cn[-1], obj$data$sample_data$sample_id)
loc <- obj$data$sample_data$Location[idx]
sel <- !is.na(idx) & !is.na(loc) & loc %in% c(s1, s2)
keeping <- cn[-1][sel]
group <- as.character(loc[sel])
obj$data$diff_table <- compare_groups(obj, dataset = "tax_abund", cols=keeping, groups=group, func=function(x, y) {
  # Difference of group medians of CLR values; NOTE(review): the name
  # log2_median_ratio assumes the CLR used base-2 logs -- confirm.
  log_ratio <- median(x) - median(y)
  if (is.nan(log_ratio)) {
    log_ratio <- 0
  }
  list(log2_median_ratio = log_ratio,
       median_diff = median(x) - median(y),
       mean_diff = mean(x) - mean(y),
       wilcox_p_value = wilcox.test(x, y)$p.value)
})
Fold change only:
# Heat tree coloured by the unfiltered median CLR difference.
heat_tree(obj, node_label=taxon_names, node_size=n_obs, node_color=log2_median_ratio, layout="davidson-harel", initial_layout="reingold-tilford")
With testing for significant differences (now the tree is only coloured if differences are significant):
# FDR-correct the Wilcoxon p values, print their range, zero out the fold
# change of non-significant taxa (adjusted p > 0.05), then re-plot.
obj$data$diff_table$wilcox_p_value <- p.adjust(obj$data$diff_table$wilcox_p_value, method="fdr")
range(obj$data$diff_table$wilcox_p_value, finite=TRUE)
obj$data$diff_table$log2_median_ratio[obj$data$diff_table$wilcox_p_value > 0.05] <- 0
heat_tree(obj, node_label=taxon_names, node_size=n_obs, node_color=log2_median_ratio, layout="davidson-harel", initial_layout="reingold-tilford")
Calculate:
# Compare CLR taxon abundances between Site2 and Site5 with per-taxon
# Wilcoxon rank sum tests across all taxonomic ranks.
options(warn = -1)
s1 = "Site2"
s2 = "Site5"
obj <- parse_tax_data(asv_tax, class_cols = "lineage", class_sep = ";")
obj$data$tax_data <- calc_obs_props(obj, "tax_data")
obj$data$tax_abund <- calc_taxon_abund(obj, "tax_data")
obj$data$sample_data = x$data$sample_data
set.seed(1)
cn = colnames(obj$data$tax_data)
rn = rownames(obj$data$sample_data)
# Vectorised replacement for the O(n^2) nested matching loop: map each
# abundance column (cn[-1]; the first column is the taxon id) to its
# metadata row and keep only samples from the two sites.  Assumes
# sample_id values are unique.
idx <- match(cn[-1], obj$data$sample_data$sample_id)
loc <- obj$data$sample_data$Location[idx]
sel <- !is.na(idx) & !is.na(loc) & loc %in% c(s1, s2)
keeping <- cn[-1][sel]
group <- as.character(loc[sel])
obj$data$diff_table <- compare_groups(obj, dataset = "tax_abund", cols=keeping, groups=group, func=function(x, y) {
  # Difference of group medians of CLR values; NOTE(review): the name
  # log2_median_ratio assumes the CLR used base-2 logs -- confirm.
  log_ratio <- median(x) - median(y)
  if (is.nan(log_ratio)) {
    log_ratio <- 0
  }
  list(log2_median_ratio = log_ratio,
       median_diff = median(x) - median(y),
       mean_diff = mean(x) - mean(y),
       wilcox_p_value = wilcox.test(x, y)$p.value)
})
Fold change only:
# Heat tree coloured by the unfiltered median CLR difference.
heat_tree(obj, node_label=taxon_names, node_size=n_obs, node_color=log2_median_ratio, layout="davidson-harel", initial_layout="reingold-tilford")
With testing for significant differences (now the tree is only coloured if differences are significant):
# FDR-correct the Wilcoxon p values, print their range, zero out the fold
# change of non-significant taxa (adjusted p > 0.05), then re-plot.
obj$data$diff_table$wilcox_p_value <- p.adjust(obj$data$diff_table$wilcox_p_value, method="fdr")
range(obj$data$diff_table$wilcox_p_value, finite=TRUE)
obj$data$diff_table$log2_median_ratio[obj$data$diff_table$wilcox_p_value > 0.05] <- 0
heat_tree(obj, node_label=taxon_names, node_size=n_obs, node_color=log2_median_ratio, layout="davidson-harel", initial_layout="reingold-tilford")
Calculate:
# Compare CLR taxon abundances between Site3 and Site4 with per-taxon
# Wilcoxon rank sum tests across all taxonomic ranks.
options(warn = -1)
s1 = "Site3"
s2 = "Site4"
obj <- parse_tax_data(asv_tax, class_cols = "lineage", class_sep = ";")
obj$data$tax_data <- calc_obs_props(obj, "tax_data")
obj$data$tax_abund <- calc_taxon_abund(obj, "tax_data")
obj$data$sample_data = x$data$sample_data
set.seed(1)
cn = colnames(obj$data$tax_data)
rn = rownames(obj$data$sample_data)
# Vectorised replacement for the O(n^2) nested matching loop: map each
# abundance column (cn[-1]; the first column is the taxon id) to its
# metadata row and keep only samples from the two sites.  Assumes
# sample_id values are unique.
idx <- match(cn[-1], obj$data$sample_data$sample_id)
loc <- obj$data$sample_data$Location[idx]
sel <- !is.na(idx) & !is.na(loc) & loc %in% c(s1, s2)
keeping <- cn[-1][sel]
group <- as.character(loc[sel])
obj$data$diff_table <- compare_groups(obj, dataset = "tax_abund", cols=keeping, groups=group, func=function(x, y) {
  # Difference of group medians of CLR values; NOTE(review): the name
  # log2_median_ratio assumes the CLR used base-2 logs -- confirm.
  log_ratio <- median(x) - median(y)
  if (is.nan(log_ratio)) {
    log_ratio <- 0
  }
  list(log2_median_ratio = log_ratio,
       median_diff = median(x) - median(y),
       mean_diff = mean(x) - mean(y),
       wilcox_p_value = wilcox.test(x, y)$p.value)
})
Fold change only:
# Heat tree coloured by the unfiltered median CLR difference.
heat_tree(obj, node_label=taxon_names, node_size=n_obs, node_color=log2_median_ratio, layout="davidson-harel", initial_layout="reingold-tilford")
With testing for significant differences (now the tree is only coloured if differences are significant):
# FDR-correct the Wilcoxon p values, print their range, zero out the fold
# change of non-significant taxa (adjusted p > 0.05), then re-plot.
obj$data$diff_table$wilcox_p_value <- p.adjust(obj$data$diff_table$wilcox_p_value, method="fdr")
range(obj$data$diff_table$wilcox_p_value, finite=TRUE)
obj$data$diff_table$log2_median_ratio[obj$data$diff_table$wilcox_p_value > 0.05] <- 0
heat_tree(obj, node_label=taxon_names, node_size=n_obs, node_color=log2_median_ratio, layout="davidson-harel", initial_layout="reingold-tilford")
Calculate:
# Compare CLR taxon abundances between Site3 and Site5 with per-taxon
# Wilcoxon rank sum tests across all taxonomic ranks.
options(warn = -1)
s1 = "Site3"
s2 = "Site5"
obj <- parse_tax_data(asv_tax, class_cols = "lineage", class_sep = ";")
obj$data$tax_data <- calc_obs_props(obj, "tax_data")
obj$data$tax_abund <- calc_taxon_abund(obj, "tax_data")
obj$data$sample_data = x$data$sample_data
set.seed(1)
cn = colnames(obj$data$tax_data)
rn = rownames(obj$data$sample_data)
# Vectorised replacement for the O(n^2) nested matching loop: map each
# abundance column (cn[-1]; the first column is the taxon id) to its
# metadata row and keep only samples from the two sites.  Assumes
# sample_id values are unique.
idx <- match(cn[-1], obj$data$sample_data$sample_id)
loc <- obj$data$sample_data$Location[idx]
sel <- !is.na(idx) & !is.na(loc) & loc %in% c(s1, s2)
keeping <- cn[-1][sel]
group <- as.character(loc[sel])
obj$data$diff_table <- compare_groups(obj, dataset = "tax_abund", cols=keeping, groups=group, func=function(x, y) {
  # Difference of group medians of CLR values; NOTE(review): the name
  # log2_median_ratio assumes the CLR used base-2 logs -- confirm.
  log_ratio <- median(x) - median(y)
  if (is.nan(log_ratio)) {
    log_ratio <- 0
  }
  list(log2_median_ratio = log_ratio,
       median_diff = median(x) - median(y),
       mean_diff = mean(x) - mean(y),
       wilcox_p_value = wilcox.test(x, y)$p.value)
})
Fold change only:
# Heat tree coloured by the unfiltered median CLR difference.
heat_tree(obj, node_label=taxon_names, node_size=n_obs, node_color=log2_median_ratio, layout="davidson-harel", initial_layout="reingold-tilford")
With testing for significant differences (now the tree is only coloured if differences are significant):
# FDR-correct the Wilcoxon p values, print their range, zero out the fold
# change of non-significant taxa (adjusted p > 0.05), then re-plot.
obj$data$diff_table$wilcox_p_value <- p.adjust(obj$data$diff_table$wilcox_p_value, method="fdr")
range(obj$data$diff_table$wilcox_p_value, finite=TRUE)
obj$data$diff_table$log2_median_ratio[obj$data$diff_table$wilcox_p_value > 0.05] <- 0
heat_tree(obj, node_label=taxon_names, node_size=n_obs, node_color=log2_median_ratio, layout="davidson-harel", initial_layout="reingold-tilford")
Calculate:
# Compare CLR taxon abundances between Site4 and Site5 with per-taxon
# Wilcoxon rank sum tests across all taxonomic ranks.
options(warn = -1)
s1 = "Site4"
s2 = "Site5"
obj <- parse_tax_data(asv_tax, class_cols = "lineage", class_sep = ";")
obj$data$tax_data <- calc_obs_props(obj, "tax_data")
obj$data$tax_abund <- calc_taxon_abund(obj, "tax_data")
obj$data$sample_data = x$data$sample_data
set.seed(1)
cn = colnames(obj$data$tax_data)
rn = rownames(obj$data$sample_data)
# Vectorised replacement for the O(n^2) nested matching loop: map each
# abundance column (cn[-1]; the first column is the taxon id) to its
# metadata row and keep only samples from the two sites.  Assumes
# sample_id values are unique.
idx <- match(cn[-1], obj$data$sample_data$sample_id)
loc <- obj$data$sample_data$Location[idx]
sel <- !is.na(idx) & !is.na(loc) & loc %in% c(s1, s2)
keeping <- cn[-1][sel]
group <- as.character(loc[sel])
obj$data$diff_table <- compare_groups(obj, dataset = "tax_abund", cols=keeping, groups=group, func=function(x, y) {
  # Difference of group medians of CLR values; NOTE(review): the name
  # log2_median_ratio assumes the CLR used base-2 logs -- confirm.
  log_ratio <- median(x) - median(y)
  if (is.nan(log_ratio)) {
    log_ratio <- 0
  }
  list(log2_median_ratio = log_ratio,
       median_diff = median(x) - median(y),
       mean_diff = mean(x) - mean(y),
       wilcox_p_value = wilcox.test(x, y)$p.value)
})
Fold change only:
# Heat tree coloured by the unfiltered median CLR difference.
heat_tree(obj, node_label=taxon_names, node_size=n_obs, node_color=log2_median_ratio, layout="davidson-harel", initial_layout="reingold-tilford")
With testing for significant differences (now the tree is only coloured if differences are significant):
# FDR-correct the Wilcoxon p values, print their range, zero out the fold
# change of non-significant taxa (adjusted p > 0.05), then re-plot.
obj$data$diff_table$wilcox_p_value <- p.adjust(obj$data$diff_table$wilcox_p_value, method="fdr")
range(obj$data$diff_table$wilcox_p_value, finite=TRUE)
obj$data$diff_table$log2_median_ratio[obj$data$diff_table$wilcox_p_value > 0.05] <- 0
heat_tree(obj, node_label=taxon_names, node_size=n_obs, node_color=log2_median_ratio, layout="davidson-harel", initial_layout="reingold-tilford")
This is another method for testing differential abundance.
The resulting plot shows a point for each of the ASVs that DESeq2 finds to be significantly differentially abundant (adjusted p < 0.05).
# DESeq2 differential abundance, Treatment vs Control, across all samples.
# NOTE(review): this chunk uses contrast = c("Treatment", "Treatment",
# "Control") while the later per-site chunks use c("Treatment", "Control",
# "Treatment"), so the sign of log2FoldChange is flipped between the plots
# -- confirm which direction was intended.
options(warn = -1)
ps = physeq
sample_data(ps)$Treatment <- as.factor(sample_data(ps)$Treatment)
ds = phyloseq_to_deseq2(ps, ~ Treatment)
ds = DESeq(ds)
alpha = 0.05
res = results(ds, contrast=c("Treatment", "Treatment", "Control"), alpha=alpha)
# Sort by adjusted p value; na.last=NA also drops ASVs with NA padj.
res = res[order(res$padj, na.last=NA), ]
res_sig = res[which(res$padj < alpha), ]
# Attach each significant ASV's taxonomy for plotting.
res_sig = cbind(as(res_sig, "data.frame"), as(tax_table(ps)[rownames(res_sig), ], "matrix"))
# One jittered point per significant ASV: x = ta4 (Order), colour = ta2 (Phylum).
sig_plot <- ggplot(res_sig, aes(x=ta4, y=log2FoldChange, color=ta2)) +
  geom_jitter(size=3, width = 0.2) +
  theme(axis.text.x = element_text(angle = -90, hjust = 0, vjust=0.5))
sig_plot
As we’re looking at fold change we can only look at two at a time, so I’ve selected the two sites where we have samples from both the treatment and control groups.
# DESeq2 differential abundance, Control vs Treatment, restricted to the
# Batch2 samples (a site with both treatment and control samples).
options(warn = -1)
samples_keeping = c("Batch2DNA1", "Batch2DNA2","Batch2DNA3","Batch2DNA7","Batch2DNA8","Batch2DNA9")
ps = prune_samples(samples_keeping, physeq)
sample_data(ps)$Treatment <- as.factor(sample_data(ps)$Treatment)
ds = phyloseq_to_deseq2(ps, ~ Treatment)
ds = DESeq(ds)
alpha = 0.05
res = results(ds, contrast=c("Treatment", "Control", "Treatment"), alpha=alpha)
# Sort by adjusted p value; na.last=NA also drops ASVs with NA padj.
res = res[order(res$padj, na.last=NA), ]
res_sig = res[which(res$padj < alpha), ]
# Attach each significant ASV's taxonomy for plotting.
res_sig = cbind(as(res_sig, "data.frame"), as(tax_table(ps)[rownames(res_sig), ], "matrix"))
# One jittered point per significant ASV: x = ta4 (Order), colour = ta2 (Phylum).
sig_plot <- ggplot(res_sig, aes(x=ta4, y=log2FoldChange, color=ta2)) +
  geom_jitter(size=3, width = 0.2) +
  theme(axis.text.x = element_text(angle = -90, hjust = 0, vjust=0.5))
sig_plot
# DESeq2 differential abundance, Control vs Treatment, restricted to the
# DNA10-DNA15 samples (the second site with both groups).
options(warn = -1)
samples_keeping = c("DNA10","DNA11","DNA12","DNA13","DNA14","DNA15")
ps = prune_samples(samples_keeping, physeq)
sample_data(ps)$Treatment <- as.factor(sample_data(ps)$Treatment)
ds = phyloseq_to_deseq2(ps, ~ Treatment)
ds = DESeq(ds)
alpha = 0.05
res = results(ds, contrast=c("Treatment", "Control", "Treatment"), alpha=alpha)
# Sort by adjusted p value; na.last=NA also drops ASVs with NA padj.
res = res[order(res$padj, na.last=NA), ]
res_sig = res[which(res$padj < alpha), ]
# Attach each significant ASV's taxonomy for plotting.
res_sig = cbind(as(res_sig, "data.frame"), as(tax_table(ps)[rownames(res_sig), ], "matrix"))
# One jittered point per significant ASV: x = ta4 (Order), colour = ta2 (Phylum).
sig_plot <- ggplot(res_sig, aes(x=ta4, y=log2FoldChange, color=ta2)) +
  geom_jitter(size=3, width = 0.2) +
  theme(axis.text.x = element_text(angle = -90, hjust = 0, vjust=0.5))
sig_plot